evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/main.py +147 -9
- evalvault/adapters/inbound/api/routers/config.py +6 -1
- evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/methods/external_command.py +22 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
- evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
- evalvault/config/secret_manager.py +118 -0
- evalvault/config/settings.py +141 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -2,14 +2,59 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import hashlib
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from collections import defaultdict, deque
|
|
5
9
|
from contextlib import asynccontextmanager
|
|
6
10
|
from typing import Annotated
|
|
7
11
|
|
|
8
|
-
from fastapi import Depends, FastAPI, Request
|
|
12
|
+
from fastapi import Depends, FastAPI, HTTPException, Request, Security
|
|
9
13
|
from fastapi.middleware.cors import CORSMiddleware
|
|
14
|
+
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
15
|
+
from starlette.responses import JSONResponse
|
|
10
16
|
|
|
11
17
|
from evalvault.adapters.inbound.api.adapter import WebUIAdapter, create_adapter
|
|
12
|
-
from evalvault.config.settings import get_settings
|
|
18
|
+
from evalvault.config.settings import Settings, get_settings, is_production_profile
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RateLimiter:
    """In-memory request limiter that tracks recent request times per key.

    For every key a deque of ``time.monotonic()`` timestamps is kept; entries
    older than the window are discarded on each check. A separate counter
    records how many times each key has been rejected.

    NOTE(review): nothing in this class ever removes a key from either map,
    so memory grows with the number of distinct keys seen — confirm whether
    an eviction strategy is needed for the deployment.
    """

    def __init__(self) -> None:
        # key -> timestamps (monotonic seconds) of requests inside the window
        self._requests: dict[str, deque[float]] = defaultdict(deque)
        # key -> cumulative number of rejected requests
        self._blocked_counts: dict[str, int] = defaultdict(int)

    def check(self, key: str, limit: int, window_seconds: int) -> tuple[bool, int | None, int]:
        """Record a request for *key* and report whether it is allowed.

        Args:
            key: Bucket identifier (e.g. a token hash or client address).
            limit: Maximum number of requests accepted inside one window.
            window_seconds: Window length in seconds; values below 1 are
                treated as 1.

        Returns:
            ``(allowed, retry_after, blocked_count)`` where ``retry_after`` is
            ``None`` when the request is allowed, otherwise the number of whole
            seconds (at least 1) until the oldest tracked request leaves the
            window, and ``blocked_count`` is the cumulative rejection count
            for *key*.
        """
        import math

        now = time.monotonic()
        window = max(window_seconds, 1)
        queue = self._requests[key]
        # Drop timestamps that have aged out of the window.
        while queue and now - queue[0] >= window:
            queue.popleft()
        if len(queue) >= limit:
            self._blocked_counts[key] += 1
            # Round *up*: truncating would tell the client to retry slightly
            # before the oldest request has actually expired, guaranteeing
            # another rejection.
            retry_after = math.ceil(window - (now - queue[0])) if queue else window
            return False, max(retry_after, 1), self._blocked_counts[key]
        queue.append(now)
        return True, None, self._blocked_counts[key]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Module-level limiter instance; the rate-limit middleware calls its check().
rate_limiter = RateLimiter()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _hash_token(token: str) -> str:
    """Return the first 8 hex characters of the SHA-256 digest of *token*.

    The token is encoded as UTF-8 before hashing, so the raw token value does
    not appear in the returned string.
    """
    hasher = hashlib.sha256()
    hasher.update(token.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:8]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _rate_limit_key(request: Request) -> str:
    """Derive the rate-limit bucket key for *request*.

    A non-empty bearer token in the ``Authorization`` header yields
    ``"token:<hash>"`` (the hash comes from ``_hash_token``). Otherwise the
    key is ``"ip:<host>"`` taken from ``request.client``, or ``"ip:unknown"``
    when no client information is available.
    """
    header_value = request.headers.get("Authorization", "")
    bearer_token = ""
    if header_value.lower().startswith("bearer "):
        # Skip the 7-character "bearer " scheme prefix.
        bearer_token = header_value[7:].strip()
    if bearer_token:
        return f"token:{_hash_token(bearer_token)}"

    peer = request.client
    if peer:
        host = peer.host
    else:
        host = "unknown"
    return f"ip:{host}"
|
|
13
58
|
|
|
14
59
|
|
|
15
60
|
@asynccontextmanager
|
|
@@ -23,6 +68,31 @@ async def lifespan(app: FastAPI):
|
|
|
23
68
|
pass
|
|
24
69
|
|
|
25
70
|
|
|
71
|
+
# Bearer scheme consumed by require_api_token, which itself handles the case
# where no credentials were supplied (credentials is None).
auth_scheme = HTTPBearer(auto_error=False)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _normalize_api_tokens(raw_tokens: str | None) -> set[str]:
    """Split a comma-separated token string into a set of trimmed tokens.

    ``None``, an empty string, and blank entries all contribute nothing, so
    the result may be an empty set.
    """
    normalized: set[str] = set()
    if not raw_tokens:
        return normalized
    for piece in raw_tokens.split(","):
        cleaned = piece.strip()
        if cleaned:
            normalized.add(cleaned)
    return normalized
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def require_api_token(
    credentials: Annotated[HTTPAuthorizationCredentials | None, Security(auth_scheme)],
    settings: Settings = Depends(get_settings),
) -> str | None:
    """Dependency that enforces bearer-token authentication when configured.

    Args:
        credentials: Parsed ``Authorization`` credentials, or ``None`` when
            the request carried none.
        settings: Application settings; ``api_auth_tokens`` is read as a
            comma-separated list of accepted tokens.

    Returns:
        ``None`` when no tokens are configured (authentication disabled),
        otherwise the presented token.

    Raises:
        HTTPException: 401 with ``WWW-Authenticate: Bearer`` when tokens are
            configured and the presented token is missing or not accepted.
    """
    import secrets

    tokens = _normalize_api_tokens(settings.api_auth_tokens)
    if not tokens:
        return None

    presented = credentials.credentials if credentials is not None else None
    authorized = False
    if presented is not None:
        presented_bytes = presented.encode("utf-8")
        # Constant-time comparison against every configured token (no early
        # exit) so response timing does not depend on how much of a secret
        # matched. Bytes are used because compare_digest rejects non-ASCII str.
        for token in tokens:
            if secrets.compare_digest(presented_bytes, token.encode("utf-8")):
                authorized = True

    if not authorized:
        raise HTTPException(
            status_code=401,
            detail="Invalid or missing API token",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return presented
|
|
94
|
+
|
|
95
|
+
|
|
26
96
|
def create_app() -> FastAPI:
|
|
27
97
|
"""Create and configure the FastAPI application."""
|
|
28
98
|
app = FastAPI(
|
|
@@ -32,10 +102,46 @@ def create_app() -> FastAPI:
|
|
|
32
102
|
lifespan=lifespan,
|
|
33
103
|
)
|
|
34
104
|
|
|
105
|
+
@app.middleware("http")
|
|
106
|
+
async def rate_limit_middleware(request: Request, call_next):
|
|
107
|
+
settings = get_settings()
|
|
108
|
+
if not settings.rate_limit_enabled:
|
|
109
|
+
return await call_next(request)
|
|
110
|
+
if not request.url.path.startswith("/api/"):
|
|
111
|
+
return await call_next(request)
|
|
112
|
+
limit = max(settings.rate_limit_requests, 1)
|
|
113
|
+
window_seconds = max(settings.rate_limit_window_seconds, 1)
|
|
114
|
+
key = _rate_limit_key(request)
|
|
115
|
+
allowed, retry_after, blocked_count = rate_limiter.check(
|
|
116
|
+
key,
|
|
117
|
+
limit,
|
|
118
|
+
window_seconds,
|
|
119
|
+
)
|
|
120
|
+
if not allowed:
|
|
121
|
+
if blocked_count >= settings.rate_limit_block_threshold:
|
|
122
|
+
logger.warning(
|
|
123
|
+
"Rate limit blocked request",
|
|
124
|
+
extra={
|
|
125
|
+
"rate_limit_key": key,
|
|
126
|
+
"blocked_count": blocked_count,
|
|
127
|
+
},
|
|
128
|
+
)
|
|
129
|
+
headers = {"Retry-After": str(retry_after)} if retry_after else None
|
|
130
|
+
return JSONResponse(
|
|
131
|
+
status_code=429,
|
|
132
|
+
content={"detail": "Rate limit exceeded"},
|
|
133
|
+
headers=headers,
|
|
134
|
+
)
|
|
135
|
+
return await call_next(request)
|
|
136
|
+
|
|
35
137
|
settings = get_settings()
|
|
36
138
|
cors_origins = [
|
|
37
139
|
origin.strip() for origin in (settings.cors_origins or "").split(",") if origin.strip()
|
|
38
|
-
]
|
|
140
|
+
]
|
|
141
|
+
if not cors_origins:
|
|
142
|
+
if is_production_profile(settings.evalvault_profile):
|
|
143
|
+
raise RuntimeError("CORS_ORIGINS must be set for production profile.")
|
|
144
|
+
cors_origins = ["http://localhost:5173"]
|
|
39
145
|
|
|
40
146
|
# Configure CORS
|
|
41
147
|
app.add_middleware(
|
|
@@ -48,12 +154,44 @@ def create_app() -> FastAPI:
|
|
|
48
154
|
|
|
49
155
|
from .routers import benchmark, config, domain, knowledge, pipeline, runs
|
|
50
156
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
app.include_router(
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
157
|
+
auth_dependencies = [Depends(require_api_token)]
|
|
158
|
+
|
|
159
|
+
app.include_router(
|
|
160
|
+
runs.router,
|
|
161
|
+
prefix="/api/v1/runs",
|
|
162
|
+
tags=["runs"],
|
|
163
|
+
dependencies=auth_dependencies,
|
|
164
|
+
)
|
|
165
|
+
app.include_router(
|
|
166
|
+
benchmark.router,
|
|
167
|
+
prefix="/api/v1/benchmarks",
|
|
168
|
+
tags=["benchmarks"],
|
|
169
|
+
dependencies=auth_dependencies,
|
|
170
|
+
)
|
|
171
|
+
app.include_router(
|
|
172
|
+
knowledge.router,
|
|
173
|
+
prefix="/api/v1/knowledge",
|
|
174
|
+
tags=["knowledge"],
|
|
175
|
+
dependencies=auth_dependencies,
|
|
176
|
+
)
|
|
177
|
+
app.include_router(
|
|
178
|
+
pipeline.router,
|
|
179
|
+
prefix="/api/v1/pipeline",
|
|
180
|
+
tags=["pipeline"],
|
|
181
|
+
dependencies=auth_dependencies,
|
|
182
|
+
)
|
|
183
|
+
app.include_router(
|
|
184
|
+
domain.router,
|
|
185
|
+
prefix="/api/v1/domain",
|
|
186
|
+
tags=["domain"],
|
|
187
|
+
dependencies=auth_dependencies,
|
|
188
|
+
)
|
|
189
|
+
app.include_router(
|
|
190
|
+
config.router,
|
|
191
|
+
prefix="/api/v1/config",
|
|
192
|
+
tags=["config"],
|
|
193
|
+
dependencies=auth_dependencies,
|
|
194
|
+
)
|
|
57
195
|
|
|
58
196
|
@app.get("/health")
|
|
59
197
|
def health_check():
|
|
@@ -28,6 +28,9 @@ def get_config():
|
|
|
28
28
|
"phoenix_api_token",
|
|
29
29
|
"postgres_password",
|
|
30
30
|
"postgres_connection_string",
|
|
31
|
+
"api_auth_tokens",
|
|
32
|
+
"knowledge_read_tokens",
|
|
33
|
+
"knowledge_write_tokens",
|
|
31
34
|
}
|
|
32
35
|
)
|
|
33
36
|
|
|
@@ -80,7 +83,6 @@ def update_config(
|
|
|
80
83
|
payload: ConfigUpdateRequest,
|
|
81
84
|
adapter: AdapterDep,
|
|
82
85
|
):
|
|
83
|
-
"""Update runtime configuration (non-secret fields only)."""
|
|
84
86
|
updates = payload.model_dump(exclude_unset=True)
|
|
85
87
|
if not updates:
|
|
86
88
|
return get_config()
|
|
@@ -96,6 +98,9 @@ def update_config(
|
|
|
96
98
|
"phoenix_api_token",
|
|
97
99
|
"postgres_password",
|
|
98
100
|
"postgres_connection_string",
|
|
101
|
+
"api_auth_tokens",
|
|
102
|
+
"knowledge_read_tokens",
|
|
103
|
+
"knowledge_write_tokens",
|
|
99
104
|
}
|
|
100
105
|
)
|
|
101
106
|
|
|
@@ -2,10 +2,11 @@ import shutil
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
5
|
-
from fastapi import APIRouter, BackgroundTasks, File, HTTPException, UploadFile
|
|
5
|
+
from fastapi import APIRouter, BackgroundTasks, Depends, File, HTTPException, Request, UploadFile
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
from evalvault.adapters.outbound.kg.parallel_kg_builder import ParallelKGBuilder
|
|
9
|
+
from evalvault.config.settings import Settings, get_settings
|
|
9
10
|
|
|
10
11
|
router = APIRouter(tags=["knowledge"])
|
|
11
12
|
|
|
@@ -18,6 +19,47 @@ KG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
18
19
|
KG_JOBS: dict[str, dict[str, Any]] = {}
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
def _normalize_tokens(raw_tokens: str | None) -> set[str]:
    """Turn a comma-separated token string into a set of non-empty tokens.

    Surrounding whitespace is stripped from each entry; a missing or empty
    input produces an empty set.
    """
    if not raw_tokens:
        return set()
    trimmed = (entry.strip() for entry in raw_tokens.split(","))
    return {entry for entry in trimmed if entry}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _extract_bearer_token(request: Request) -> str | None:
    """Return the bearer token from the ``Authorization`` header, if any.

    The scheme is matched case-insensitively. ``None`` is returned when the
    header is absent, empty, or uses a different scheme; otherwise the text
    after the ``"bearer "`` prefix is returned with whitespace stripped
    (which may be an empty string).
    """
    scheme = "bearer "
    header_value = request.headers.get("Authorization", "")
    is_bearer = bool(header_value) and header_value.lower().startswith(scheme)
    if not is_bearer:
        return None
    remainder = header_value[len(scheme) :]
    return remainder.strip()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _require_knowledge_read_token(
    request: Request,
    settings: Settings = Depends(get_settings),
) -> None:
    """Dependency guarding read access to the knowledge endpoints.

    Tokens from both ``knowledge_read_tokens`` and ``knowledge_write_tokens``
    are accepted. When neither setting contains a token the check is skipped.

    Raises:
        HTTPException: 403 when tokens are configured and the request's
            bearer token is missing or not among them.
    """
    accepted = _normalize_tokens(settings.knowledge_read_tokens) | _normalize_tokens(
        settings.knowledge_write_tokens
    )
    if not accepted:
        # Nothing configured: read access is unrestricted.
        return
    presented = _extract_bearer_token(request)
    if presented is not None and presented in accepted:
        return
    raise HTTPException(status_code=403, detail="Invalid or missing knowledge read token")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _require_knowledge_write_token(
    request: Request,
    settings: Settings = Depends(get_settings),
) -> None:
    """Dependency guarding write access to the knowledge endpoints.

    Only tokens listed in ``knowledge_write_tokens`` are accepted. When that
    setting contains no token the check is skipped.

    Raises:
        HTTPException: 403 when write tokens are configured and the request's
            bearer token is missing or not among them.
    """
    accepted = _normalize_tokens(settings.knowledge_write_tokens)
    if not accepted:
        # Nothing configured: write access is unrestricted.
        return
    presented = _extract_bearer_token(request)
    if presented is not None and presented in accepted:
        return
    raise HTTPException(status_code=403, detail="Invalid or missing knowledge write token")
|
|
61
|
+
|
|
62
|
+
|
|
21
63
|
class BuildKGRequest(BaseModel):
|
|
22
64
|
workers: int = 4
|
|
23
65
|
batch_size: int = 32
|
|
@@ -26,7 +68,10 @@ class BuildKGRequest(BaseModel):
|
|
|
26
68
|
|
|
27
69
|
|
|
28
70
|
@router.post("/upload")
|
|
29
|
-
async def upload_files(
|
|
71
|
+
async def upload_files(
|
|
72
|
+
files: list[UploadFile] = File(...),
|
|
73
|
+
_: None = Depends(_require_knowledge_write_token),
|
|
74
|
+
):
|
|
30
75
|
"""Upload documents for Knowledge Graph building."""
|
|
31
76
|
uploaded = []
|
|
32
77
|
for file in files:
|
|
@@ -40,7 +85,9 @@ async def upload_files(files: list[UploadFile] = File(...)):
|
|
|
40
85
|
|
|
41
86
|
|
|
42
87
|
@router.get("/files")
|
|
43
|
-
def list_files(
|
|
88
|
+
def list_files(
|
|
89
|
+
_: None = Depends(_require_knowledge_read_token),
|
|
90
|
+
):
|
|
44
91
|
"""List uploaded files."""
|
|
45
92
|
files = []
|
|
46
93
|
if DATA_DIR.exists():
|
|
@@ -49,7 +96,11 @@ def list_files():
|
|
|
49
96
|
|
|
50
97
|
|
|
51
98
|
@router.post("/build", status_code=202)
|
|
52
|
-
async def build_knowledge_graph(
|
|
99
|
+
async def build_knowledge_graph(
|
|
100
|
+
request: BuildKGRequest,
|
|
101
|
+
background_tasks: BackgroundTasks,
|
|
102
|
+
_: None = Depends(_require_knowledge_write_token),
|
|
103
|
+
):
|
|
53
104
|
"""Trigger background Knowledge Graph construction."""
|
|
54
105
|
job_id = f"kg_build_{len(KG_JOBS) + 1}"
|
|
55
106
|
KG_JOBS[job_id] = {"status": "pending", "progress": "0%", "details": "Queued"}
|
|
@@ -121,7 +172,10 @@ async def build_knowledge_graph(request: BuildKGRequest, background_tasks: Backg
|
|
|
121
172
|
|
|
122
173
|
|
|
123
174
|
@router.get("/jobs/{job_id}")
|
|
124
|
-
def get_job_status(
|
|
175
|
+
def get_job_status(
|
|
176
|
+
job_id: str,
|
|
177
|
+
_: None = Depends(_require_knowledge_read_token),
|
|
178
|
+
):
|
|
125
179
|
job = KG_JOBS.get(job_id)
|
|
126
180
|
if not job:
|
|
127
181
|
raise HTTPException(status_code=404, detail="Job not found")
|
|
@@ -129,7 +183,9 @@ def get_job_status(job_id: str):
|
|
|
129
183
|
|
|
130
184
|
|
|
131
185
|
@router.get("/stats")
|
|
132
|
-
def get_graph_stats(
|
|
186
|
+
def get_graph_stats(
|
|
187
|
+
_: None = Depends(_require_knowledge_read_token),
|
|
188
|
+
):
|
|
133
189
|
"""Get statistics of the built Knowledge Graph."""
|
|
134
190
|
# Try to load from memory DB or default output JSON
|
|
135
191
|
# For now, we'll try to load the JSON if it exists, or just return empty
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from collections.abc import Callable
|
|
6
6
|
from dataclasses import dataclass
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any
|
|
8
8
|
|
|
9
9
|
import typer
|
|
10
10
|
from rich.console import Console
|
|
@@ -12,8 +12,11 @@ from rich.console import Console
|
|
|
12
12
|
from .agent import register_agent_commands
|
|
13
13
|
from .analyze import register_analyze_commands
|
|
14
14
|
from .api import register_api_command
|
|
15
|
+
from .artifacts import create_artifacts_app
|
|
15
16
|
from .benchmark import create_benchmark_app
|
|
16
17
|
from .calibrate import register_calibrate_commands
|
|
18
|
+
from .calibrate_judge import register_calibrate_judge_commands
|
|
19
|
+
from .compare import register_compare_commands
|
|
17
20
|
from .config import register_config_commands
|
|
18
21
|
from .debug import create_debug_app
|
|
19
22
|
from .domain import create_domain_app
|
|
@@ -25,19 +28,17 @@ from .init import register_init_command
|
|
|
25
28
|
from .kg import create_kg_app
|
|
26
29
|
from .langfuse import register_langfuse_commands
|
|
27
30
|
from .method import create_method_app
|
|
31
|
+
from .ops import create_ops_app
|
|
28
32
|
from .phoenix import create_phoenix_app
|
|
29
33
|
from .pipeline import register_pipeline_commands
|
|
34
|
+
from .profile_difficulty import register_profile_difficulty_commands
|
|
30
35
|
from .prompts import create_prompts_app
|
|
36
|
+
from .regress import register_regress_commands
|
|
31
37
|
from .run import register_run_commands
|
|
32
38
|
from .stage import create_stage_app
|
|
33
39
|
|
|
34
40
|
CommandFactory = Callable[[Console], typer.Typer]
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class CommandRegistrar(Protocol):
|
|
38
|
-
"""Callable protocol for Typer command registrars."""
|
|
39
|
-
|
|
40
|
-
def __call__(self, app: typer.Typer, console: Console, **kwargs: Any) -> None: ...
|
|
41
|
+
CommandRegistrar = Callable[..., Any]
|
|
41
42
|
|
|
42
43
|
|
|
43
44
|
@dataclass(frozen=True)
|
|
@@ -61,10 +62,14 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
|
|
|
61
62
|
CommandModule(register_run_commands, needs_metrics=True),
|
|
62
63
|
CommandModule(register_pipeline_commands),
|
|
63
64
|
CommandModule(register_history_commands),
|
|
65
|
+
CommandModule(register_compare_commands),
|
|
64
66
|
CommandModule(register_analyze_commands),
|
|
65
67
|
CommandModule(register_calibrate_commands),
|
|
68
|
+
CommandModule(register_calibrate_judge_commands),
|
|
66
69
|
CommandModule(register_generate_commands),
|
|
67
70
|
CommandModule(register_gate_commands),
|
|
71
|
+
CommandModule(register_profile_difficulty_commands, needs_metrics=True),
|
|
72
|
+
CommandModule(register_regress_commands),
|
|
68
73
|
CommandModule(register_agent_commands),
|
|
69
74
|
CommandModule(register_experiment_commands),
|
|
70
75
|
CommandModule(register_config_commands),
|
|
@@ -78,9 +83,11 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
|
|
|
78
83
|
SubAppModule("domain", create_domain_app),
|
|
79
84
|
SubAppModule("benchmark", create_benchmark_app),
|
|
80
85
|
SubAppModule("method", create_method_app),
|
|
86
|
+
SubAppModule("ops", create_ops_app),
|
|
81
87
|
SubAppModule("phoenix", create_phoenix_app),
|
|
82
88
|
SubAppModule("prompts", create_prompts_app),
|
|
83
89
|
SubAppModule("stage", create_stage_app),
|
|
90
|
+
SubAppModule("artifacts", create_artifacts_app),
|
|
84
91
|
SubAppModule("debug", create_debug_app),
|
|
85
92
|
)
|
|
86
93
|
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
|
|
10
|
+
from evalvault.adapters.inbound.cli.utils.console import print_cli_error
|
|
11
|
+
from evalvault.adapters.inbound.cli.utils.validators import validate_choice
|
|
12
|
+
from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
|
|
13
|
+
from evalvault.domain.services.artifact_lint_service import ArtifactLintService
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_artifacts_app(console: Console) -> typer.Typer:
    """Build the ``artifacts`` Typer sub-application.

    Args:
        console: Rich console used for all user-facing output of the
            registered commands.

    Returns:
        A Typer app exposing the ``lint`` command.
    """
    artifacts_app = typer.Typer(name="artifacts", help="Artifact utilities.")

    # ``lint`` runs ArtifactLintService over a directory and emits a JSON
    # report, either to --output or to the console. It exits with status 1
    # when the lint summary status is "error". Described in comments rather
    # than a docstring so the generated CLI help text stays unchanged.
    @artifacts_app.command("lint")
    def lint(
        artifacts_dir: Path = typer.Argument(..., help="Artifacts directory."),
        strict: bool = typer.Option(False, "--strict", help="Fail on missing files."),
        output_format: str = typer.Option(
            "json",
            "--format",
            "-f",
            help="Output format (json).",
        ),
        output: Path | None = typer.Option(
            None,
            "--output",
            "-o",
            help="Output file path for lint result.",
        ),
        parallel: bool = typer.Option(
            True,
            "--parallel/--no-parallel",
            help="Enable parallel validation (placeholder).",
        ),
        concurrency: int = typer.Option(
            8,
            "--concurrency",
            min=1,
            help="Parallel validation concurrency (placeholder).",
        ),
    ) -> None:
        # NOTE(review): presumably aborts on an unsupported format — confirm
        # against validate_choice.
        validate_choice(output_format, ["json"], console, value_label="format")

        logger.info("Artifacts lint command started: %s", artifacts_dir)
        fs_adapter = LocalArtifactFileSystemAdapter()
        service = ArtifactLintService(fs_adapter)
        summary = service.lint(artifacts_dir, strict=strict)

        # parallel/concurrency are only echoed into the report here.
        payload = _build_payload(summary, parallel=parallel, concurrency=concurrency)
        rendered = json.dumps(payload, ensure_ascii=False, indent=2)
        if output:
            output.parent.mkdir(parents=True, exist_ok=True)
            output.write_text(rendered, encoding="utf-8")
            console.print(f"[green]Lint report saved:[/green] {output}")
        else:
            # Emit the JSON verbatim: with markup/emoji processing active,
            # text such as "[bold]" or ":name:" inside messages or paths would
            # be interpreted by Rich, and hard wrapping could split long
            # lines, all of which would corrupt the JSON document.
            console.print(rendered, markup=False, emoji=False, soft_wrap=True)

        if summary.status == "error":
            logger.error("Artifacts lint command failed: %s", artifacts_dir)
            print_cli_error(console, "Artifact lint failed", details=str(artifacts_dir))
            raise typer.Exit(1)

        logger.info("Artifacts lint command finished: %s", artifacts_dir)

    return artifacts_app
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _build_payload(summary, *, parallel: bool, concurrency: int) -> dict[str, object]:
    """Convert a lint summary into the JSON-serialisable report dictionary.

    Args:
        summary: Lint result object. This function reads its ``issues``
            (each with ``level``, ``code``, ``message``, ``path``),
            ``status``, ``started_at``, ``finished_at``, ``duration_ms``,
            ``artifacts_dir``, ``index_path`` and ``strict`` attributes.
        parallel: Value of the ``--parallel`` flag, echoed into the report.
        concurrency: Value of ``--concurrency``, echoed into the report.

    Returns:
        The report dictionary, including per-level issue counts for the
        ``"error"`` and ``"warning"`` levels.
    """
    issues: list[dict[str, object]] = []
    error_count = 0
    warning_count = 0
    # Single pass: the issue list and both counters are built together so a
    # one-shot iterable of issues is still counted correctly.
    for issue in summary.issues:
        raw_path = issue.path
        issues.append(
            {
                "level": issue.level,
                "code": issue.code,
                "message": issue.message,
                # Stringified like the other paths in this report so a
                # path-like object cannot break json.dumps; None is kept.
                "path": None if raw_path is None else str(raw_path),
            }
        )
        if issue.level == "error":
            error_count += 1
        elif issue.level == "warning":
            warning_count += 1

    return {
        "command": "artifacts.lint",
        "version": 1,
        "status": summary.status,
        "started_at": summary.started_at.isoformat(),
        "finished_at": summary.finished_at.isoformat(),
        "duration_ms": summary.duration_ms,
        "artifacts": {
            "dir": str(summary.artifacts_dir),
            "index": str(summary.index_path),
        },
        "data": {
            "strict": summary.strict,
            "parallel": parallel,
            "concurrency": concurrency,
            "issue_counts": {
                "error": error_count,
                "warning": warning_count,
            },
            "issues": issues,
        },
    }
|