fleet-python 0.2.110__tar.gz → 0.2.112__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.110/fleet_python.egg-info → fleet_python-0.2.112}/PKG-INFO +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/__init__.py +9 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/client.py +14 -0
- fleet_python-0.2.112/fleet/_async/judge.py +96 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/filesystem.py +8 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/base.py +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/client.py +14 -0
- fleet_python-0.2.112/fleet/judge.py +521 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/filesystem.py +8 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/SOURCES.txt +2 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/pyproject.toml +1 -1
- {fleet_python-0.2.110 → fleet_python-0.2.112}/LICENSE +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/README.md +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/diff_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_account.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_sync.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_task.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/export_tasks_filtered.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/openai_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/quickstart.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/api.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/agent.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp/main.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/orchestrator.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/types.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/cli.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/config.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/env/client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/global_client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/models.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/api.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/tasks.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/types.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/scripts/unasync.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/setup.cfg +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/__init__.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_expect_exactly.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_verifier_from_string.py +0 -0
|
@@ -68,12 +68,15 @@ from .tasks import (
|
|
|
68
68
|
# Import shared types
|
|
69
69
|
from .types import VerifierFunction
|
|
70
70
|
|
|
71
|
+
# Import judge data classes
|
|
72
|
+
from .judge import Rubric, Criterion, Image, JudgeResult
|
|
73
|
+
|
|
71
74
|
# Create a module-level env attribute for convenient access
|
|
72
75
|
from . import env
|
|
73
76
|
from . import global_client as _global_client
|
|
74
77
|
from ._async import global_client as _async_global_client
|
|
75
78
|
|
|
76
|
-
__version__ = "0.2.110"
|
|
79
|
+
__version__ = "0.2.112"
|
|
77
80
|
|
|
78
81
|
__all__ = [
|
|
79
82
|
# Core classes
|
|
@@ -90,6 +93,11 @@ __all__ = [
|
|
|
90
93
|
# Task models
|
|
91
94
|
"Task",
|
|
92
95
|
"VerifierFunction",
|
|
96
|
+
# Judge
|
|
97
|
+
"Rubric",
|
|
98
|
+
"Criterion",
|
|
99
|
+
"Image",
|
|
100
|
+
"JudgeResult",
|
|
93
101
|
# Exceptions
|
|
94
102
|
"FleetError",
|
|
95
103
|
"FleetAPIError",
|
|
@@ -54,6 +54,7 @@ from .tasks import Task
|
|
|
54
54
|
|
|
55
55
|
if TYPE_CHECKING:
|
|
56
56
|
from .verifiers import AsyncVerifierFunction
|
|
57
|
+
from .judge import AsyncJudge
|
|
57
58
|
|
|
58
59
|
|
|
59
60
|
def _json_default(x: Any) -> Any:
|
|
@@ -344,6 +345,7 @@ class AsyncEnv(EnvironmentBase):
|
|
|
344
345
|
self._client = client
|
|
345
346
|
self._apps: Dict[str, AsyncInstanceClient] = {}
|
|
346
347
|
self._instance: Optional[AsyncInstanceClient] = None
|
|
348
|
+
self._judge: Optional["AsyncJudge"] = None
|
|
347
349
|
|
|
348
350
|
@property
|
|
349
351
|
def instance(self) -> AsyncInstanceClient:
|
|
@@ -419,6 +421,18 @@ class AsyncEnv(EnvironmentBase):
|
|
|
419
421
|
mcp_url = f"{self.urls.root}mcp"
|
|
420
422
|
return AsyncMCPResource(url=mcp_url, env_key=self.env_key)
|
|
421
423
|
|
|
424
|
+
@property
def judge(self) -> "AsyncJudge":
    """Return this environment's judge helper, building it on first access.

    The helper is constructed from ``self._load_client`` and
    ``self.instance_id`` and cached on ``self._judge``, so later accesses
    return the same object.
    """
    existing = self._judge
    if existing is not None:
        return existing

    # Imported at call time: at module level AsyncJudge is only imported
    # under TYPE_CHECKING.
    from .judge import AsyncJudge

    created = AsyncJudge(client=self._load_client, instance_id=self.instance_id)
    self._judge = created
    return created
|
|
435
|
+
|
|
422
436
|
def state(self, uri: str) -> Resource:
|
|
423
437
|
return self.instance.state(uri)
|
|
424
438
|
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Fleet SDK Judge - Async version.
|
|
2
|
+
|
|
3
|
+
Provides env.judge.grade() for async verifier scripts.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Dict, List, Optional, Union, TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
# Import shared classes and helpers from the sync module
|
|
9
|
+
from ..judge import (
|
|
10
|
+
Criterion,
|
|
11
|
+
Image,
|
|
12
|
+
JudgeResult,
|
|
13
|
+
Rubric,
|
|
14
|
+
_build_grade_request,
|
|
15
|
+
_parse_grade_response,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from .base import AsyncWrapper
|
|
20
|
+
|
|
21
|
+
# Re-export data classes so `from fleet._async.judge import ...` works
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AsyncJudge",
|
|
24
|
+
"Criterion",
|
|
25
|
+
"Image",
|
|
26
|
+
"JudgeResult",
|
|
27
|
+
"Rubric",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AsyncJudge:
    """Async LLM-as-judge grader.

    Sends grading requests to ``/v1/judge/grade`` through the client it is
    given (the orchestrator API, not the environment API). Exposed as
    ``env.judge`` on AsyncEnv instances.
    """

    def __init__(self, client: "AsyncWrapper", instance_id: str):
        self._instance_id = instance_id
        self._client = client

    async def grade(
        self,
        rubric: Union[str, Rubric],
        submission: Optional[str] = None,
        *,
        ground_truth: Optional[Union[str, dict]] = None,
        problem: Optional[str] = None,
        context: Optional[str] = None,
        reference_claims: Optional[str] = None,
        conversation: Optional[List[dict]] = None,
        images: Optional[Dict[str, Image]] = None,
        model: Optional[str] = None,
        provider: Optional[str] = None,
        agentic: bool = False,
        collect: Optional[Dict[str, List[str]]] = None,
        task_id: Optional[str] = None,
    ) -> JudgeResult:
        """Grade a submission with the server-side LLM judge.

        The request body is assembled by ``_build_grade_request``, POSTed to
        ``/v1/judge/grade``, and the JSON reply is converted by
        ``_parse_grade_response``. The returned JudgeResult is a float
        subclass (with .details, .criteria, .feedback), so a verifier
        function can return it directly.

        Args:
            rubric: Grading rubric, either plain text or a structured Rubric.
            submission: The agent's final answer / submission text.
            ground_truth: Expected answer (string or dict).
            problem: The original problem statement.
            context: Additional context for the judge.
            reference_claims: Reference analysis claims.
            conversation: Conversation history as a list of message dicts.
            images: Named images for the judge (e.g., gold reference, agent output).
            model: Override LLM model (server picks default if None).
            provider: Override LLM provider (server picks default if None).
            agentic: If True, the orchestrator collects artifacts from the instance.
            collect: File patterns for the orchestrator to collect (agentic mode).
            task_id: Optional task ID for tracking.

        Returns:
            The parsed grading result.
        """
        options = {
            "ground_truth": ground_truth,
            "problem": problem,
            "context": context,
            "reference_claims": reference_claims,
            "conversation": conversation,
            "images": images,
            "model": model,
            "provider": provider,
            "agentic": agentic,
            "collect": collect,
            "task_id": task_id,
        }
        payload = _build_grade_request(
            self._instance_id, rubric, submission, **options
        )

        http_response = await self._client.request(
            "POST", "/v1/judge/grade", json=payload
        )
        return _parse_grade_response(http_response.json())
|
|
@@ -301,6 +301,14 @@ class AsyncFilesystemResource(Resource):
|
|
|
301
301
|
response = await self.client.request(
|
|
302
302
|
"POST", "/fs/file", json=request.model_dump()
|
|
303
303
|
)
|
|
304
|
+
if response.status_code == 404:
|
|
305
|
+
return FileStateResponse(
|
|
306
|
+
success=True, path=path, exists=False,
|
|
307
|
+
message=response.json().get("detail", "File not found"),
|
|
308
|
+
)
|
|
309
|
+
if response.status_code >= 400:
|
|
310
|
+
detail = response.json().get("detail", response.text)
|
|
311
|
+
raise RuntimeError(f"Failed to get file state for '{path}': {detail}")
|
|
304
312
|
return FileStateResponse(**response.json())
|
|
305
313
|
|
|
306
314
|
async def file_text(self, path: str, max_content_size: int = 102400) -> str:
|
|
@@ -59,6 +59,7 @@ from .tasks import Task
|
|
|
59
59
|
|
|
60
60
|
if TYPE_CHECKING:
|
|
61
61
|
from .verifiers import SyncVerifierFunction
|
|
62
|
+
from .judge import SyncJudge
|
|
62
63
|
|
|
63
64
|
|
|
64
65
|
def _json_default(x: Any) -> Any:
|
|
@@ -348,6 +349,7 @@ class SyncEnv(EnvironmentBase):
|
|
|
348
349
|
self._client = client
|
|
349
350
|
self._apps: Dict[str, InstanceClient] = {}
|
|
350
351
|
self._instance: Optional[InstanceClient] = None
|
|
352
|
+
self._judge: Optional["SyncJudge"] = None
|
|
351
353
|
self._manager_url_override: Optional[str] = None # For URL mode
|
|
352
354
|
|
|
353
355
|
@property
|
|
@@ -431,6 +433,18 @@ class SyncEnv(EnvironmentBase):
|
|
|
431
433
|
mcp_url = f"{self.urls.root}mcp"
|
|
432
434
|
return SyncMCPResource(url=mcp_url, env_key=self.env_key)
|
|
433
435
|
|
|
436
|
+
@property
def judge(self) -> "SyncJudge":
    """Return this environment's judge helper, building it on first access.

    The helper is constructed from ``self._load_client`` and
    ``self.instance_id`` and cached on ``self._judge``, so later accesses
    return the same object.
    """
    existing = self._judge
    if existing is not None:
        return existing

    # Imported at call time: at module level SyncJudge is only imported
    # under TYPE_CHECKING.
    from .judge import SyncJudge

    created = SyncJudge(client=self._load_client, instance_id=self.instance_id)
    self._judge = created
    return created
|
|
447
|
+
|
|
434
448
|
def state(self, uri: str) -> Resource:
|
|
435
449
|
return self.instance.state(uri)
|
|
436
450
|
|
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
"""Fleet SDK Judge - LLM-as-Judge grading via orchestrator API.
|
|
2
|
+
|
|
3
|
+
Provides env.judge.grade() for verifier scripts to grade submissions
|
|
4
|
+
using LLM judges without managing API keys, HTTP calls, or response parsing.
|
|
5
|
+
|
|
6
|
+
All LLM calls happen server-side on the orchestrator — the SDK just sends
|
|
7
|
+
the rubric, submission, and artifacts, and gets back a score.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from .base import SyncWrapper
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Data classes (used by both sync and async)
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Extension (lower-case, without the dot) -> media type.
_MEDIA_TYPE_BY_EXTENSION = {
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "webp": "image/webp",
    "svg": "image/svg+xml",
}


def _guess_media_type(filename: str) -> str:
    """Map a filename's extension to an image media type.

    The text after the last "." is compared case-insensitively. Names with
    no dot, and extensions not in the table, yield "image/png".
    """
    _, dot, suffix = filename.rpartition(".")
    if not dot:
        return "image/png"
    return _MEDIA_TYPE_BY_EXTENSION.get(suffix.lower(), "image/png")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class Criterion:
    """One scored line item of a grading rubric.

    Args:
        name: Name of this criterion (e.g., "Takeaway Alignment")
        max: Maximum points for this criterion
        levels: Optional mapping of score -> description for each level.
            Folded into the description string sent to the API.
        description: Optional freeform description (alternative to levels)
    """

    name: str
    max: int
    levels: Optional[Dict[int, str]] = None
    description: Optional[str] = None

    def _render_description(self) -> str:
        """Build the description text: level lines (highest score first), then
        the freeform description; falls back to the name when both are empty."""
        lines = []
        if self.levels:
            ranked = sorted(self.levels.items(), key=lambda kv: kv[0], reverse=True)
            lines.extend(f"- {points} points: {text}" for points, text in ranked)
        if self.description:
            lines.append(self.description)
        if not lines:
            return self.name
        return "\n".join(lines)

    def serialize(self) -> dict:
        """Return the dict form of this criterion used in the request body."""
        payload = {}
        payload["name"] = self.name
        payload["max_score"] = self.max
        payload["description"] = self._render_description()
        return payload
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class Rubric:
    """A structured rubric: an ordered list of criteria plus an optional prompt.

    Args:
        criteria: List of Criterion objects
        system_prompt: Optional override for the judge system prompt
    """

    criteria: List[Criterion] = field(default_factory=list)
    system_prompt: Optional[str] = None

    @property
    def max_score(self) -> int:
        """Total of every criterion's ``max``."""
        total = 0
        for criterion in self.criteria:
            total += criterion.max
        return total

    def serialize(self) -> dict:
        """Return the dict form of this rubric used in the request body.

        ``system_prompt`` is included only when it is not None.
        """
        serialized_criteria = []
        for criterion in self.criteria:
            serialized_criteria.append(criterion.serialize())

        payload: dict = {"type": "structured", "criteria": serialized_criteria}
        if self.system_prompt is None:
            return payload
        payload["system_prompt"] = self.system_prompt
        return payload
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Image:
    """Reference to an image for LLM judge grading.

    Use the static constructors to create instances:
        Image.s3("s3://bucket/key")          - S3 URL, fetched server-side
        Image.from_url("https://...")        - HTTP URL, fetched server-side
        Image.from_base64(data, "file.png")  - Inline base64 data
        Image.from_env(env, "plot.png")      - Collect from environment
    """

    def __init__(
        self,
        *,
        source: str,
        url: Optional[str] = None,
        data: Optional[str] = None,
        filename: Optional[str] = None,
        media_type: Optional[str] = None,
        _env: Optional[Any] = None,
    ):
        self.source = source
        self.url = url
        self.data = data
        self.filename = filename
        self.media_type = media_type
        self._env = _env

    def __repr__(self) -> str:
        # The base64 payload is deliberately left out: it can be very large.
        return (
            f"{type(self).__name__}(source={self.source!r}, url={self.url!r}, "
            f"filename={self.filename!r}, media_type={self.media_type!r})"
        )

    @staticmethod
    def s3(url: str, media_type: Optional[str] = None) -> "Image":
        """Reference an image in S3. The orchestrator fetches it server-side."""
        return Image(source="s3", url=url, media_type=media_type)

    @staticmethod
    def from_url(url: str, media_type: Optional[str] = None) -> "Image":
        """Reference an image by HTTP URL. The orchestrator fetches it server-side."""
        return Image(source="url", url=url, media_type=media_type)

    @staticmethod
    def from_base64(
        data: str, filename: str = "image.png", media_type: Optional[str] = None
    ) -> "Image":
        """Inline base64 image data.

        When ``media_type`` is omitted it is guessed from ``filename``.
        """
        return Image(
            source="base64",
            data=data,
            filename=filename,
            media_type=media_type or _guess_media_type(filename),
        )

    @staticmethod
    def from_env(env: Any, filename: str) -> "Image":
        """Collect an image from the environment.

        In non-agentic mode, the SDK collects the image client-side
        (DB -> notebook -> filesystem) and sends base64 to the orchestrator.

        In agentic mode, only the filename hint is sent and the orchestrator
        collects it.
        """
        return Image(source="env", filename=filename, _env=env)

    def _resolved_media_type(self) -> str:
        """Explicit media type if set, otherwise a guess from the filename."""
        return self.media_type or _guess_media_type(self.filename or "image.png")

    def _serialize_env(self, agentic: bool) -> dict:
        """Serialize an env-sourced image.

        Sends a "collect" reference in agentic mode, or when client-side
        collection finds nothing; otherwise sends the collected base64 data.
        """
        collect_ref = {"source": "collect", "selector": self.filename}
        if agentic:
            return collect_ref

        b64 = _collect_image_from_env(self._env, self.filename)
        if b64 is None:
            return collect_ref
        # Honour an explicitly supplied media_type, as the "base64" branch
        # does, rather than always guessing from the filename.
        # NOTE(review): the collector can return a notebook's image/png
        # output whatever the filename says, so a guessed type may not match
        # the data -- confirm whether the collector should report the type.
        return {
            "source": "base64",
            "data": b64,
            "media_type": self._resolved_media_type(),
        }

    def serialize(self, *, label: Optional[str] = None, agentic: bool = False) -> dict:
        """Serialize for the orchestrator API request body.

        Args:
            label: Optional label added to the result under "label".
            agentic: Only affects env-sourced images; see ``from_env``.

        Returns:
            A dict whose "source" is "s3", "url", "base64" or "collect".

        Raises:
            ValueError: If ``self.source`` is not a known source.
        """
        d: dict
        if self.source in ("s3", "url"):
            d = {"source": self.source, "url": self.url}
            if self.media_type:
                d["media_type"] = self.media_type
        elif self.source == "base64":
            d = {
                "source": "base64",
                "data": self.data,
                "media_type": self._resolved_media_type(),
            }
        elif self.source == "env":
            d = self._serialize_env(agentic)
        else:
            raise ValueError(f"Unknown image source: {self.source}")

        if label is not None:
            d["label"] = label
        return d
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class JudgeResult(float):
    """A score that is a plain float and also carries the judge's details.

    Because it subclasses float it can be returned directly from a verifier
    function; the extra attributes (details, criteria, feedback,
    execution_id) expose structured metadata from the judge response.
    """

    def __new__(cls, score: float, *, details: Optional[dict] = None):
        result = super().__new__(cls, score)
        payload = details if details else {}
        result.details = payload  # type: ignore[attr-defined]
        # Convenience attributes lifted out of the details dict, each with an
        # empty fallback when the key is absent.
        for attr, fallback in (("criteria", []), ("feedback", ""), ("execution_id", "")):
            setattr(result, attr, payload.get(attr, fallback))
        return result
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ---------------------------------------------------------------------------
|
|
217
|
+
# Image collection helpers
|
|
218
|
+
# ---------------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _rows_as_dicts(cols: List[Any], rows: List[Any]) -> List[Dict[str, Any]]:
    """Turn positional rows into dicts keyed by ``str(column)``.

    List/tuple rows are paired with ``cols`` up to the shorter length, dict
    rows pass through unchanged, and rows of any other type are dropped.
    """
    converted: List[Dict[str, Any]] = []
    for row in rows:
        if isinstance(row, (list, tuple)):
            converted.append(dict(zip(map(str, cols), row)))
        elif isinstance(row, dict):
            converted.append(row)
    return converted


def _extract_query_rows(result: Any) -> List[Dict[str, Any]]:
    """Extract rows from a query response, handling various formats.

    Accepted shapes, checked in this order:
      1. an object with list-valued ``columns`` and ``rows`` attributes,
      2. a dict with list-valued "columns" and "rows" keys,
      3. a plain list, from which only dict items are kept.

    Returns:
        A list of row dicts; empty for ``None`` or any unrecognised shape.
    """
    if result is None:
        return []

    # Attribute-style first, then dict-style: same precedence as before.
    shapes = [(getattr(result, "columns", None), getattr(result, "rows", None))]
    if isinstance(result, dict):
        shapes.append((result.get("columns"), result.get("rows")))

    for cols, rows in shapes:
        if isinstance(cols, list) and isinstance(rows, list):
            return _rows_as_dicts(cols, rows)

    if isinstance(result, list):
        return [row for row in result if isinstance(row, dict)]
    return []
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _collect_image_from_env(env: Any, filename: str) -> Optional[str]:
    """Collect an image from the environment using DB -> notebook -> filesystem strategies.

    Each strategy is best-effort: a failure is logged at debug level and the
    next strategy is tried.

    Args:
        env: Object exposing ``db("current").query(sql)``.
        filename: File name (or path) of the image to look for.

    Returns:
        Base64-encoded image data, or None if not found.
    """
    # Strategy 1: DB files table
    try:
        found = _image_from_db_files(env, filename)
        if found is not None:
            return found
    except Exception as e:
        logger.debug("DB image query failed: %s", e)

    # Strategy 2: Notebook cell outputs
    try:
        found = _image_from_notebooks(env)
        if found is not None:
            return found
    except Exception as e:
        logger.debug("Notebook image query failed: %s", e)

    # Strategy 3: Filesystem fallback
    return _image_from_filesystem(filename)


def _image_from_db_files(env: Any, filename: str) -> Optional[str]:
    """Look up ``filename`` in the ``files`` table; return base64 content or None."""
    # Double single quotes so a name like "o'brien.png" stays inside the SQL
    # string literal instead of breaking the statement.
    quoted = filename.replace("'", "''")
    sql = (
        "SELECT path, hex(content) AS content_hex FROM files "
        f"WHERE path = '{quoted}' OR path LIKE '%/{quoted}'"
    )
    rows = _extract_query_rows(env.db("current").query(sql))

    suffix = ("/" + filename).lower()
    candidates: Dict[str, bytes] = {}
    for row in rows:
        path, chex = row.get("path", ""), row.get("content_hex", "")
        if not (path and chex):
            continue
        # LIKE treats "_" and "%" in the filename as wildcards, so re-check
        # the match literally (case-insensitively, as LIKE is for ASCII).
        if path != filename and not path.lower().endswith(suffix):
            continue
        try:
            candidates[path] = bytes.fromhex(chex)
        except (TypeError, ValueError):
            logger.debug("Skipping undecodable DB content for: %s", path)

    if not candidates:
        return None

    # Prefer non-dataroom paths, then the shortest path.
    preferred = [p for p in candidates if not p.startswith("dataroom/")]
    best = min(preferred or list(candidates), key=len)
    logger.debug("Loaded image from DB: %s", best)
    return base64.b64encode(candidates[best]).decode()


def _image_from_notebooks(env: Any) -> Optional[str]:
    """Return the last image/png output found in any stored notebook, or None.

    Notebooks are read from ``files`` rows under ``notebooks/``; cells are
    scanned from last to first. The returned string is the notebook's own
    base64 text for the image.
    """
    rows = _extract_query_rows(
        env.db("current").query(
            "SELECT path, hex(content) AS content_hex FROM files "
            "WHERE path LIKE 'notebooks/%.ipynb'"
        )
    )
    for nb_row in rows:
        chex = nb_row.get("content_hex", "")
        if not chex:
            continue
        try:
            notebook = json.loads(bytes.fromhex(chex).decode("utf-8"))
            for cell in reversed(notebook.get("cells", [])):
                for output in cell.get("outputs", []):
                    if output.get("output_type") not in ("display_data", "execute_result"):
                        continue
                    img_data = output.get("data", {}).get("image/png")
                    if not img_data:
                        continue
                    if isinstance(img_data, list):
                        img_data = "".join(img_data)
                    img_data = img_data.strip()
                    if img_data:
                        logger.debug("Loaded image from notebook: %s", nb_row.get("path"))
                        return img_data
        except Exception as e:
            # Deliberately broad: one malformed notebook must not stop the
            # search through the remaining ones.
            logger.debug("Skipping unreadable notebook %s: %s", nb_row.get("path"), e)
    return None


def _image_from_filesystem(filename: str) -> Optional[str]:
    """Read ``filename`` from the local filesystem; return base64 content or None.

    Tries the name as given, then under /app/workspace and /workspace.
    """
    search_paths = (
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    )
    for fp in search_paths:
        try:
            with open(fp, "rb") as f:
                payload = f.read()
        except (OSError, ValueError):
            # Missing, unreadable, a directory, or an invalid path: try the next.
            continue
        logger.debug("Loaded image from filesystem: %s", fp)
        return base64.b64encode(payload).decode()
    return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
# Accumulator printing (verifier protocol)
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _print_accumulators(data: dict) -> None:
    """Echo the orchestrator's accumulators to stdout using the verifier markers.

    Reads ``data["accumulators"]`` and does nothing when it is missing or
    empty. Each non-empty ``errors``, ``successes`` and ``grading_details``
    entry is printed as three lines: an opening marker, the entry encoded as
    JSON, and a closing marker. A non-empty ``timing`` entry is printed as one
    summary line built from its ``started_ms``, ``finished_ms`` and
    ``duration_ms`` values.

    Args:
        data: Decoded orchestrator response.
    """
    accumulators = data.get("accumulators")
    if not accumulators:
        return

    # (key in accumulators, opening marker, closing marker), in print order.
    # NOTE(review): only the error opening marker carries the "[STDOUT] "
    # prefix; it is kept verbatim here -- confirm against whatever parses
    # this output.
    marked_sections = (
        ("errors", "[STDOUT] >>> ERROR_ACCUMULATOR >>>", "<<< ERROR_ACCUMULATOR <<<"),
        ("successes", ">>> SUCCESS_ACCUMULATOR >>>", "<<< SUCCESS_ACCUMULATOR <<<"),
        ("grading_details", ">>> GRADING_DETAILS >>>", "<<< GRADING_DETAILS <<<"),
    )
    for key, opening, closing in marked_sections:
        payload = accumulators.get(key)
        if not payload:
            continue
        print(opening)
        print(json.dumps(payload))
        print(closing)

    timing = accumulators.get("timing")
    if timing:
        started = timing.get("started_ms")
        finished = timing.get("finished_ms")
        duration = timing.get("duration_ms")
        print(
            f">>> TIMING: started={started}, "
            f"finished={finished}, "
            f"duration={duration}ms <<<"
        )
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# ---------------------------------------------------------------------------
|
|
372
|
+
# Request body builder (shared by sync and async)
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _build_grade_request(
    instance_id: str,
    rubric: Union[str, Rubric],
    submission: Optional[str],
    *,
    ground_truth: Optional[Union[str, dict]] = None,
    problem: Optional[str] = None,
    context: Optional[str] = None,
    reference_claims: Optional[str] = None,
    conversation: Optional[List[dict]] = None,
    images: Optional[Dict[str, Image]] = None,
    model: Optional[str] = None,
    provider: Optional[str] = None,
    agentic: bool = False,
    collect: Optional[Dict[str, List[str]]] = None,
    task_id: Optional[str] = None,
) -> dict:
    """Assemble the JSON payload sent to POST /v1/judge/grade.

    ``instance_id``, ``submission``, ``agentic`` and ``rubric`` are always
    present in the result. Every other field is added only when its argument
    is not None (``images`` only when the mapping is non-empty).

    Args:
        instance_id: Sent as ``instance_id``.
        rubric: A string is wrapped as ``{"type": "string", "text": ...}``;
            a Rubric contributes the result of its ``serialize()``.
        submission: Sent as ``submission`` (may be None).
        ground_truth: Sent unchanged as ``ground_truth``.
        problem: Sent unchanged as ``problem``.
        context: Sent as ``context``, after ``reference_claims`` has been
            appended to it (see below).
        reference_claims: Not sent as its own field. It is placed under a
            ``## Reference Claims`` heading and appended to ``context``, or
            becomes the whole context when ``context`` is empty or None.
        conversation: Each message is reduced to its ``role`` and ``content``
            keys; a message lacking either raises KeyError.
        images: Mapping of label to Image. Each image is serialized with its
            label and the ``agentic`` flag and sent as a list under ``images``.
        model: Sent unchanged as ``model``.
        provider: Sent unchanged as ``provider``.
        agentic: Sent as ``agentic`` and forwarded to image serialization.
        collect: Sent unchanged as ``collect``.
        task_id: Sent unchanged as ``task_id``.

    Returns:
        The request body as a dict.

    Raises:
        TypeError: If ``rubric`` is neither a str nor a Rubric.
    """
    # Resolve the rubric payload first; an unsupported type fails fast.
    if isinstance(rubric, str):
        rubric_payload = {"type": "string", "text": rubric}
    elif isinstance(rubric, Rubric):
        rubric_payload = rubric.serialize()
    else:
        raise TypeError(f"rubric must be str or Rubric, got {type(rubric)}")

    request: Dict[str, Any] = {
        "instance_id": instance_id,
        "submission": submission,
        "agentic": agentic,
        "rubric": rubric_payload,
    }

    # Reference claims travel inside the context field, under their own heading.
    if reference_claims is not None:
        claims_section = f"## Reference Claims\n{reference_claims}"
        context = f"{context}\n\n{claims_section}" if context else claims_section

    # Only role/content are forwarded; any other message keys are dropped.
    trimmed_conversation = (
        None
        if conversation is None
        else [
            {"role": message["role"], "content": message["content"]}
            for message in conversation
        ]
    )

    # Optional fields, in the order they are inserted into the payload.
    optional_fields = (
        ("ground_truth", ground_truth),
        ("problem", problem),
        ("context", context),
        ("conversation", trimmed_conversation),
        ("model", model),
        ("provider", provider),
        ("task_id", task_id),
        ("collect", collect),
    )
    for field_name, field_value in optional_fields:
        if field_value is not None:
            request[field_name] = field_value

    # Images are sent as a list, each entry carrying its label.
    if images:
        request["images"] = [
            image.serialize(label=image_label, agentic=agentic)
            for image_label, image in images.items()
        ]

    return request
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _parse_grade_response(data: dict) -> JudgeResult:
    """Turn an orchestrator grade response into a JudgeResult.

    Prints any accumulators carried by the response first, then wraps the
    response in a JudgeResult whose numeric value is ``normalized_score``.

    Args:
        data: Decoded JSON body of the grade response.

    Returns:
        A JudgeResult built from the score, with ``data`` passed as
        ``details``. The score is 0.0 when ``normalized_score`` is missing
        or null.

    Raises:
        TypeError, ValueError: If ``normalized_score`` is present, not null,
            and cannot be converted to float.
    """
    _print_accumulators(data)
    raw_score = data.get("normalized_score")
    # A JSON null arrives as None. dict.get's default only covers a missing
    # key, so float(None) would raise; treat null like a missing score.
    score = 0.0 if raw_score is None else float(raw_score)
    return JudgeResult(score, details=data)
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# ---------------------------------------------------------------------------
|
|
452
|
+
# Sync judge
|
|
453
|
+
# ---------------------------------------------------------------------------
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
class SyncJudge:
    """Synchronous LLM-as-judge grader bound to one instance.

    Grading requests are sent to the orchestrator API, not the environment
    API. Exposed as ``env.judge`` on SyncEnv instances.
    """

    def __init__(self, client: "SyncWrapper", instance_id: str):
        # The instance id is attached to every grade request; the client
        # performs the HTTP call.
        self._instance_id = instance_id
        self._client = client

    def grade(
        self,
        rubric: Union[str, Rubric],
        submission: Optional[str] = None,
        *,
        ground_truth: Optional[Union[str, dict]] = None,
        problem: Optional[str] = None,
        context: Optional[str] = None,
        reference_claims: Optional[str] = None,
        conversation: Optional[List[dict]] = None,
        images: Optional[Dict[str, Image]] = None,
        model: Optional[str] = None,
        provider: Optional[str] = None,
        agentic: bool = False,
        collect: Optional[Dict[str, List[str]]] = None,
        task_id: Optional[str] = None,
    ) -> JudgeResult:
        """Score a submission with LLM-as-judge through the orchestrator API.

        Builds the request body, POSTs it to ``/v1/judge/grade`` and parses
        the JSON reply. The returned JudgeResult is a float subclass (with
        .details, .criteria, .feedback), so a verifier function can return
        it directly.

        Args:
            rubric: Grading rubric, either plain text or a structured Rubric.
            submission: The agent's final answer / submission text.
            ground_truth: Expected answer (string or dict).
            problem: The original problem statement.
            context: Additional context for the judge.
            reference_claims: Reference analysis claims (folded into context).
            conversation: Conversation history as a list of message dicts.
            images: Mapping of label to Image object for the judge.
            model: Override LLM model (server picks default if None).
            provider: Override LLM provider (server picks default if None).
            agentic: If True, the orchestrator collects artifacts from the
                instance.
            collect: File patterns for the orchestrator to collect (agentic
                mode).
            task_id: Optional task ID for tracking.

        Returns:
            The parsed grading result.
        """
        # Gather the keyword-only options once and forward them unchanged.
        grade_options = {
            "ground_truth": ground_truth,
            "problem": problem,
            "context": context,
            "reference_claims": reference_claims,
            "conversation": conversation,
            "images": images,
            "model": model,
            "provider": provider,
            "agentic": agentic,
            "collect": collect,
            "task_id": task_id,
        }
        payload = _build_grade_request(
            self._instance_id, rubric, submission, **grade_options
        )

        reply = self._client.request("POST", "/v1/judge/grade", json=payload)
        return _parse_grade_response(reply.json())
|
|
@@ -301,6 +301,14 @@ class FilesystemResource(Resource):
|
|
|
301
301
|
response = self.client.request(
|
|
302
302
|
"POST", "/fs/file", json=request.model_dump()
|
|
303
303
|
)
|
|
304
|
+
if response.status_code == 404:
|
|
305
|
+
return FileStateResponse(
|
|
306
|
+
success=True, path=path, exists=False,
|
|
307
|
+
message=response.json().get("detail", "File not found"),
|
|
308
|
+
)
|
|
309
|
+
if response.status_code >= 400:
|
|
310
|
+
detail = response.json().get("detail", response.text)
|
|
311
|
+
raise RuntimeError(f"Failed to get file state for '{path}': {detail}")
|
|
304
312
|
return FileStateResponse(**response.json())
|
|
305
313
|
|
|
306
314
|
def file_text(self, path: str, max_content_size: int = 102400) -> str:
|
|
@@ -34,6 +34,7 @@ fleet/client.py
|
|
|
34
34
|
fleet/config.py
|
|
35
35
|
fleet/exceptions.py
|
|
36
36
|
fleet/global_client.py
|
|
37
|
+
fleet/judge.py
|
|
37
38
|
fleet/models.py
|
|
38
39
|
fleet/tasks.py
|
|
39
40
|
fleet/types.py
|
|
@@ -42,6 +43,7 @@ fleet/_async/base.py
|
|
|
42
43
|
fleet/_async/client.py
|
|
43
44
|
fleet/_async/exceptions.py
|
|
44
45
|
fleet/_async/global_client.py
|
|
46
|
+
fleet/_async/judge.py
|
|
45
47
|
fleet/_async/models.py
|
|
46
48
|
fleet/_async/tasks.py
|
|
47
49
|
fleet/_async/env/__init__.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|