fleet-python 0.2.114__tar.gz → 0.2.116__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.114/fleet_python.egg-info → fleet_python-0.2.116}/PKG-INFO +1 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/__init__.py +3 -2
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/client.py +0 -2
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/judge.py +30 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/base.py +1 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/client.py +0 -2
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/judge.py +336 -2
- {fleet_python-0.2.114 → fleet_python-0.2.116/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet_python.egg-info/SOURCES.txt +1 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/pyproject.toml +1 -1
- fleet_python-0.2.116/tests/test_judge_criteria_markers.py +192 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/LICENSE +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/README.md +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/diff_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_account.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_sync.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_task.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/export_tasks_filtered.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/openai_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/quickstart.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/api.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/filesystem.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/agent.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp/main.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/orchestrator.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/types.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/cli.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/config.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/env/client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/global_client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/models.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/api.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/filesystem.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/tasks.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/types.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/scripts/unasync.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/setup.cfg +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/__init__.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_expect_exactly.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.114 → fleet_python-0.2.116}/tests/test_verifier_from_string.py +0 -0
|
@@ -69,14 +69,14 @@ from .tasks import (
|
|
|
69
69
|
from .types import VerifierFunction
|
|
70
70
|
|
|
71
71
|
# Import judge data classes
|
|
72
|
-
from .judge import Rubric, Criterion, Image, JudgeResult
|
|
72
|
+
from .judge import Rubric, Criterion, File, Image, JudgeResult
|
|
73
73
|
|
|
74
74
|
# Create a module-level env attribute for convenient access
|
|
75
75
|
from . import env
|
|
76
76
|
from . import global_client as _global_client
|
|
77
77
|
from ._async import global_client as _async_global_client
|
|
78
78
|
|
|
79
|
-
__version__ = "0.2.114"
|
|
79
|
+
__version__ = "0.2.116"
|
|
80
80
|
|
|
81
81
|
__all__ = [
|
|
82
82
|
# Core classes
|
|
@@ -96,6 +96,7 @@ __all__ = [
|
|
|
96
96
|
# Judge
|
|
97
97
|
"Rubric",
|
|
98
98
|
"Criterion",
|
|
99
|
+
"File",
|
|
99
100
|
"Image",
|
|
100
101
|
"JudgeResult",
|
|
101
102
|
# Exceptions
|
|
@@ -601,7 +601,6 @@ class AsyncFleet:
|
|
|
601
601
|
)
|
|
602
602
|
|
|
603
603
|
instance = AsyncEnv(client=self.client, **response.json())
|
|
604
|
-
await instance.instance.load()
|
|
605
604
|
return instance
|
|
606
605
|
|
|
607
606
|
async def make_for_task(self, task: Task) -> AsyncEnv:
|
|
@@ -653,7 +652,6 @@ class AsyncFleet:
|
|
|
653
652
|
else:
|
|
654
653
|
response = await self.client.request("GET", f"/v1/env/instances/{instance_id}")
|
|
655
654
|
instance = AsyncEnv(client=self.client, **response.json())
|
|
656
|
-
await instance.instance.load()
|
|
657
655
|
return instance
|
|
658
656
|
|
|
659
657
|
def _create_url_instance(self, base_url: str) -> AsyncEnv:
|
|
@@ -8,11 +8,14 @@ from typing import Dict, List, Optional, Union, TYPE_CHECKING
|
|
|
8
8
|
# Import shared classes and helpers from the sync module
|
|
9
9
|
from ..judge import (
|
|
10
10
|
Criterion,
|
|
11
|
+
File,
|
|
11
12
|
Image,
|
|
12
13
|
JudgeResult,
|
|
13
14
|
Rubric,
|
|
14
15
|
_build_grade_request,
|
|
16
|
+
_collect_file_from_env_async,
|
|
15
17
|
_collect_image_from_env_async,
|
|
18
|
+
_guess_file_media_type,
|
|
16
19
|
_guess_media_type,
|
|
17
20
|
_parse_grade_response,
|
|
18
21
|
_print_judge_call_start,
|
|
@@ -25,6 +28,7 @@ if TYPE_CHECKING:
|
|
|
25
28
|
__all__ = [
|
|
26
29
|
"AsyncJudge",
|
|
27
30
|
"Criterion",
|
|
31
|
+
"File",
|
|
28
32
|
"Image",
|
|
29
33
|
"JudgeResult",
|
|
30
34
|
"Rubric",
|
|
@@ -52,6 +56,7 @@ class AsyncJudge:
|
|
|
52
56
|
reference_claims: Optional[str] = None,
|
|
53
57
|
conversation: Optional[List[dict]] = None,
|
|
54
58
|
images: Optional[Dict[str, Image]] = None,
|
|
59
|
+
files: Optional[Dict[str, File]] = None,
|
|
55
60
|
model: Optional[str] = None,
|
|
56
61
|
provider: Optional[str] = None,
|
|
57
62
|
agentic: bool = False,
|
|
@@ -72,6 +77,7 @@ class AsyncJudge:
|
|
|
72
77
|
reference_claims: Reference analysis claims.
|
|
73
78
|
conversation: Conversation history as list of message dicts.
|
|
74
79
|
images: Named images for the judge (e.g., gold reference, agent output).
|
|
80
|
+
files: Named files for the judge (PDF, CSV, STEP, etc.).
|
|
75
81
|
model: Override LLM model (server picks default if None).
|
|
76
82
|
provider: Override LLM provider (server picks default if None).
|
|
77
83
|
agentic: If True, the orchestrator collects artifacts from the instance.
|
|
@@ -101,6 +107,28 @@ class AsyncJudge:
|
|
|
101
107
|
else:
|
|
102
108
|
resolved_images[label] = img
|
|
103
109
|
|
|
110
|
+
# Resolve File.from_env files asynchronously before building request
|
|
111
|
+
resolved_files = files
|
|
112
|
+
if files and not agentic:
|
|
113
|
+
resolved_files = {}
|
|
114
|
+
for label, f in files.items():
|
|
115
|
+
if f.source == "env" and f._env is not None:
|
|
116
|
+
b64 = await _collect_file_from_env_async(f._env, f.filename)
|
|
117
|
+
if b64 is not None:
|
|
118
|
+
resolved_files[label] = File.from_base64(
|
|
119
|
+
b64,
|
|
120
|
+
f.filename or "file",
|
|
121
|
+
_guess_file_media_type(f.filename or "file"),
|
|
122
|
+
)
|
|
123
|
+
else:
|
|
124
|
+
# Async collection failed — use collect source directly
|
|
125
|
+
resolved_files[label] = File(
|
|
126
|
+
source="collect",
|
|
127
|
+
filename=f.filename,
|
|
128
|
+
)
|
|
129
|
+
else:
|
|
130
|
+
resolved_files[label] = f
|
|
131
|
+
|
|
104
132
|
body = _build_grade_request(
|
|
105
133
|
self._instance_id,
|
|
106
134
|
rubric,
|
|
@@ -111,6 +139,7 @@ class AsyncJudge:
|
|
|
111
139
|
reference_claims=reference_claims,
|
|
112
140
|
conversation=conversation,
|
|
113
141
|
images=resolved_images,
|
|
142
|
+
files=resolved_files,
|
|
114
143
|
model=model,
|
|
115
144
|
provider=provider,
|
|
116
145
|
agentic=agentic,
|
|
@@ -118,6 +147,6 @@ class AsyncJudge:
|
|
|
118
147
|
task_id=task_id,
|
|
119
148
|
)
|
|
120
149
|
|
|
121
|
-
_print_judge_call_start(rubric, resolved_images, agentic, model)
|
|
150
|
+
_print_judge_call_start(rubric, resolved_images, agentic, model, files=resolved_files)
|
|
122
151
|
response = await self._client.request("POST", "/v1/judge/grade", json=body)
|
|
123
152
|
return _parse_grade_response(response.json())
|
|
@@ -613,7 +613,6 @@ class Fleet:
|
|
|
613
613
|
)
|
|
614
614
|
|
|
615
615
|
instance = SyncEnv(client=self.client, **response.json())
|
|
616
|
-
instance.instance.load()
|
|
617
616
|
return instance
|
|
618
617
|
|
|
619
618
|
def make_for_task(self, task: Task) -> SyncEnv:
|
|
@@ -665,7 +664,6 @@ class Fleet:
|
|
|
665
664
|
else:
|
|
666
665
|
response = self.client.request("GET", f"/v1/env/instances/{instance_id}")
|
|
667
666
|
instance = SyncEnv(client=self.client, **response.json())
|
|
668
|
-
instance.instance.load()
|
|
669
667
|
return instance
|
|
670
668
|
|
|
671
669
|
def _create_url_instance(self, base_url: str) -> SyncEnv:
|
|
@@ -38,6 +38,47 @@ def _guess_media_type(filename: str) -> str:
|
|
|
38
38
|
}.get(ext, "image/png")
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
def _guess_file_media_type(filename: str) -> str:
|
|
42
|
+
"""Guess media type from filename extension for arbitrary files.
|
|
43
|
+
|
|
44
|
+
Broader than _guess_media_type — covers documents, CAD, data formats, etc.
|
|
45
|
+
"""
|
|
46
|
+
ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else ""
|
|
47
|
+
return {
|
|
48
|
+
# Images
|
|
49
|
+
"png": "image/png",
|
|
50
|
+
"jpg": "image/jpeg",
|
|
51
|
+
"jpeg": "image/jpeg",
|
|
52
|
+
"gif": "image/gif",
|
|
53
|
+
"webp": "image/webp",
|
|
54
|
+
"svg": "image/svg+xml",
|
|
55
|
+
# Documents
|
|
56
|
+
"pdf": "application/pdf",
|
|
57
|
+
"txt": "text/plain",
|
|
58
|
+
"md": "text/markdown",
|
|
59
|
+
"html": "text/html",
|
|
60
|
+
"htm": "text/html",
|
|
61
|
+
"csv": "text/csv",
|
|
62
|
+
"tsv": "text/tab-separated-values",
|
|
63
|
+
# Data
|
|
64
|
+
"json": "application/json",
|
|
65
|
+
"xml": "application/xml",
|
|
66
|
+
"yaml": "application/x-yaml",
|
|
67
|
+
"yml": "application/x-yaml",
|
|
68
|
+
# CAD / Engineering
|
|
69
|
+
"step": "application/step",
|
|
70
|
+
"stp": "application/step",
|
|
71
|
+
"stl": "model/stl",
|
|
72
|
+
"iges": "model/iges",
|
|
73
|
+
"igs": "model/iges",
|
|
74
|
+
"obj": "model/obj",
|
|
75
|
+
# Archives
|
|
76
|
+
"zip": "application/zip",
|
|
77
|
+
"gz": "application/gzip",
|
|
78
|
+
"tar": "application/x-tar",
|
|
79
|
+
}.get(ext, "application/octet-stream")
|
|
80
|
+
|
|
81
|
+
|
|
41
82
|
@dataclass
|
|
42
83
|
class Criterion:
|
|
43
84
|
"""A single rubric criterion for grading.
|
|
@@ -199,6 +240,99 @@ class Image:
|
|
|
199
240
|
return d
|
|
200
241
|
|
|
201
242
|
|
|
243
|
+
class File:
    """Reference to an arbitrary file for LLM judge grading.

    Supports any file type (PDF, CSV, STEP, STL, etc.) via the Anthropic
    Files API. Use the static constructors to create instances:
        File.s3("s3://bucket/key") - S3 URL, fetched server-side
        File.from_base64(data, "part.step", "application/step") - Inline base64 data
        File.from_env(env, "exported_part.step") - Collect from environment

    ``source`` is one of ``"s3"``, ``"base64"``, ``"env"`` or ``"collect"``;
    any other value makes :meth:`serialize` raise ``ValueError``.
    """

    def __init__(
        self,
        *,
        source: str,
        url: Optional[str] = None,
        data: Optional[str] = None,
        filename: Optional[str] = None,
        media_type: Optional[str] = None,
        _env: Optional[Any] = None,
    ):
        self.source = source
        self.url = url
        self.data = data
        self.filename = filename
        self.media_type = media_type
        self._env = _env

    def __repr__(self) -> str:
        """Return a compact debug representation.

        Only the length of ``data`` is shown, never the base64 payload itself,
        so logging a ``File`` cannot dump the whole file contents.
        """
        parts = [f"source={self.source!r}"]
        if self.url is not None:
            parts.append(f"url={self.url!r}")
        if self.filename is not None:
            parts.append(f"filename={self.filename!r}")
        if self.media_type is not None:
            parts.append(f"media_type={self.media_type!r}")
        if self.data is not None:
            parts.append(f"data=<{len(self.data)} base64 chars>")
        return f"File({', '.join(parts)})"

    @staticmethod
    def s3(url: str, media_type: Optional[str] = None) -> "File":
        """Reference a file in S3. The orchestrator fetches it server-side."""
        return File(source="s3", url=url, media_type=media_type)

    @staticmethod
    def from_base64(
        data: str, filename: str, media_type: Optional[str] = None
    ) -> "File":
        """Inline base64 file data.

        When ``media_type`` is omitted it is guessed from ``filename``'s extension.
        """
        return File(
            source="base64",
            data=data,
            filename=filename,
            media_type=media_type or _guess_file_media_type(filename),
        )

    @staticmethod
    def from_env(env: Any, filename: str) -> "File":
        """Collect a file from the environment.

        In non-agentic mode, the SDK collects the file client-side (DB -> filesystem)
        and sends base64 to the orchestrator.

        In agentic mode, only the filename hint is sent and the orchestrator collects it.
        """
        return File(source="env", filename=filename, _env=env)

    def serialize(self, *, label: Optional[str] = None, agentic: bool = False) -> dict:
        """Serialize for the orchestrator API request body.

        Args:
            label: Optional name for the file; added under the ``"label"`` key
                when not None.
            agentic: Only affects ``source == "env"`` files. If True, a
                ``"collect"`` hint is emitted instead of collecting the file
                client-side.

        Returns:
            A dict whose ``"source"`` is ``"s3"``, ``"base64"`` or ``"collect"``.

        Raises:
            ValueError: If ``self.source`` is not a known source.
        """
        # Payload asking the orchestrator to collect the file itself, by name.
        collect_hint = {"source": "collect", "selector": self.filename}

        d: dict
        if self.source == "s3":
            d = {"source": "s3", "url": self.url}
            if self.media_type:
                d["media_type"] = self.media_type
        elif self.source == "base64":
            d = {
                "source": "base64",
                "data": self.data,
                "filename": self.filename,
                "media_type": self.media_type or _guess_file_media_type(self.filename or "file"),
            }
        elif self.source == "collect":
            d = collect_hint
        elif self.source == "env":
            if agentic:
                d = collect_hint
            else:
                b64 = _collect_file_from_env(self._env, self.filename)
                if b64 is None:
                    # Client-side collection found nothing; defer to the orchestrator.
                    d = collect_hint
                else:
                    d = {
                        "source": "base64",
                        "data": b64,
                        "filename": self.filename,
                        "media_type": _guess_file_media_type(self.filename or "file"),
                    }
        else:
            raise ValueError(f"Unknown file source: {self.source}")

        if label is not None:
            d["label"] = label
        return d
|
|
334
|
+
|
|
335
|
+
|
|
202
336
|
class JudgeResult(float):
|
|
203
337
|
"""Float subclass that carries grading details.
|
|
204
338
|
|
|
@@ -412,6 +546,102 @@ async def _collect_image_from_env_async(env: Any, filename: str) -> Optional[str
|
|
|
412
546
|
return None
|
|
413
547
|
|
|
414
548
|
|
|
549
|
+
def _file_lookup_sql(filename: str) -> str:
    """Build the ``files``-table lookup query for ``filename``, safely quoted."""
    # Double single quotes so the name cannot terminate the SQL string literal.
    exact = filename.replace("'", "''")
    # In the LIKE pattern, also neutralise the wildcards "%" and "_" (and the
    # escape character itself) so that only the leading "%/" is a wildcard.
    # "!" is used rather than a backslash to avoid string-literal escaping
    # differences between SQL dialects.
    like = (
        filename.replace("!", "!!")
        .replace("%", "!%")
        .replace("_", "!_")
        .replace("'", "''")
    )
    return (
        "SELECT path, hex(content) AS content_hex FROM files "
        f"WHERE path = '{exact}' OR path LIKE '%/{like}' ESCAPE '!'"
    )


def _pick_file_candidate(rows: Any) -> Optional[tuple]:
    """Return ``(path, content_bytes)`` for the best matching row, or None."""
    candidates = {}
    for row in rows:
        path, chex = row.get("path", ""), row.get("content_hex", "")
        if not (path and chex):
            continue
        try:
            candidates[path] = bytes.fromhex(chex)
        except (TypeError, ValueError):
            # Not a valid hex string; ignore this row.
            continue
    if not candidates:
        return None
    # Prefer non-dataroom paths; among those, the shortest path wins
    # (first one encountered on ties).
    non_dr = [p for p in candidates if not p.startswith("dataroom/")]
    best = min(non_dr or candidates, key=len)
    return best, candidates[best]


def _read_file_from_filesystem(filename: str, log_message: str) -> Optional[str]:
    """Return base64 of the first readable candidate path for ``filename``, or None."""
    search_paths = (
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    )
    for fp in search_paths:
        try:
            with open(fp, "rb") as fh:
                payload = fh.read()
        except (OSError, TypeError, ValueError):
            # Missing, unreadable, a directory, or not a usable path: try the next one.
            continue
        logger.debug(log_message, fp)
        return base64.b64encode(payload).decode()
    return None


def _collect_file_from_env(env: Any, filename: str) -> Optional[str]:
    """Collect a file from the environment using DB -> filesystem strategies.

    Similar to _collect_image_from_env but skips notebook cell output strategy
    (which is image-specific).

    Args:
        env: Environment object; ``env.db("current").query(sql)`` is called on it.
        filename: File name to look for, matched exactly or as the last path
            component in the ``files`` table, then tried on the local filesystem.

    Returns:
        Base64-encoded file data, or None if not found.
    """
    # Strategy 1: DB files table (best-effort: any failure falls through to
    # the filesystem strategy).
    try:
        rows = _extract_query_rows(env.db("current").query(_file_lookup_sql(filename)))
        hit = _pick_file_candidate(rows)
        if hit is not None:
            path, content = hit
            logger.debug("Loaded file from DB: %s", path)
            return base64.b64encode(content).decode()
    except Exception as e:
        logger.debug("DB file query failed: %s", e)

    # Strategy 2: Filesystem fallback
    return _read_file_from_filesystem(filename, "Loaded file from filesystem: %s")


async def _collect_file_from_env_async(env: Any, filename: str) -> Optional[str]:
    """Async version of _collect_file_from_env.

    Collects a file from an AsyncEnv using DB -> filesystem strategies; the
    only difference from the sync version is that the DB query is awaited.

    Returns:
        Base64-encoded file data, or None if not found.
    """
    # Strategy 1: DB files table (best-effort: any failure falls through to
    # the filesystem strategy).
    try:
        result = await env.db("current").query(_file_lookup_sql(filename))
        hit = _pick_file_candidate(_extract_query_rows(result))
        if hit is not None:
            path, content = hit
            logger.debug("Loaded file from DB (async): %s", path)
            return base64.b64encode(content).decode()
    except Exception as e:
        logger.debug("DB file query failed (async): %s", e)

    # Strategy 2: Filesystem fallback
    return _read_file_from_filesystem(filename, "Loaded file from filesystem (async): %s")
|
|
643
|
+
|
|
644
|
+
|
|
415
645
|
# ---------------------------------------------------------------------------
|
|
416
646
|
# Accumulator printing (verifier protocol)
|
|
417
647
|
# ---------------------------------------------------------------------------
|
|
@@ -447,6 +677,12 @@ def _print_accumulators(data: dict) -> None:
|
|
|
447
677
|
print(json.dumps(golden_urls))
|
|
448
678
|
print("<<< GOLDEN_URLS <<<")
|
|
449
679
|
|
|
680
|
+
agent_steps = acc.get("agent_steps")
|
|
681
|
+
if agent_steps:
|
|
682
|
+
print(">>> AGENT_STEPS >>>")
|
|
683
|
+
print(json.dumps(agent_steps))
|
|
684
|
+
print("<<< AGENT_STEPS <<<")
|
|
685
|
+
|
|
450
686
|
timing = acc.get("timing")
|
|
451
687
|
if timing:
|
|
452
688
|
print(
|
|
@@ -466,6 +702,7 @@ def _print_judge_call_start(
|
|
|
466
702
|
images: Optional[Dict[str, "Image"]],
|
|
467
703
|
agentic: bool,
|
|
468
704
|
model: Optional[str],
|
|
705
|
+
files: Optional[Dict[str, "File"]] = None,
|
|
469
706
|
) -> None:
|
|
470
707
|
"""Print info when initiating a judge grading call."""
|
|
471
708
|
mode = "agentic" if agentic else "standard"
|
|
@@ -488,6 +725,18 @@ def _print_judge_call_start(
|
|
|
488
725
|
else:
|
|
489
726
|
print("[C] No images provided")
|
|
490
727
|
|
|
728
|
+
if files:
|
|
729
|
+
for label, f in files.items():
|
|
730
|
+
src = f.source
|
|
731
|
+
detail = ""
|
|
732
|
+
if f.url:
|
|
733
|
+
detail = f" url={f.url}"
|
|
734
|
+
elif f.filename:
|
|
735
|
+
detail = f" file={f.filename}"
|
|
736
|
+
if f.media_type:
|
|
737
|
+
detail += f" type={f.media_type}"
|
|
738
|
+
print(f"[C] File '{label}': source={src}{detail}")
|
|
739
|
+
|
|
491
740
|
|
|
492
741
|
def _build_grade_request(
|
|
493
742
|
instance_id: str,
|
|
@@ -500,6 +749,7 @@ def _build_grade_request(
|
|
|
500
749
|
reference_claims: Optional[str] = None,
|
|
501
750
|
conversation: Optional[List[dict]] = None,
|
|
502
751
|
images: Optional[Dict[str, Image]] = None,
|
|
752
|
+
files: Optional[Dict[str, "File"]] = None,
|
|
503
753
|
model: Optional[str] = None,
|
|
504
754
|
provider: Optional[str] = None,
|
|
505
755
|
agentic: bool = False,
|
|
@@ -554,6 +804,13 @@ def _build_grade_request(
|
|
|
554
804
|
for label, img in images.items()
|
|
555
805
|
]
|
|
556
806
|
|
|
807
|
+
# Serialize files as labeled array
|
|
808
|
+
if files:
|
|
809
|
+
body["files"] = [
|
|
810
|
+
f.serialize(label=label, agentic=agentic)
|
|
811
|
+
for label, f in files.items()
|
|
812
|
+
]
|
|
813
|
+
|
|
557
814
|
return body
|
|
558
815
|
|
|
559
816
|
|
|
@@ -566,6 +823,54 @@ def _parse_grade_response(data: dict) -> JudgeResult:
|
|
|
566
823
|
return JudgeResult(score, details=data)
|
|
567
824
|
|
|
568
825
|
|
|
826
|
+
def _print_criteria_markers(criteria: list) -> None:
|
|
827
|
+
"""Emit ``>>> CRITERIA >>>`` stdout markers for structured criteria display.
|
|
828
|
+
|
|
829
|
+
The orchestrator (theseus PR #1967) scans verifier stdout for these
|
|
830
|
+
markers and wraps the execution result so the client (client PR #1737)
|
|
831
|
+
can render an expandable rubric breakdown.
|
|
832
|
+
|
|
833
|
+
Converts from the orchestrator judge-response format::
|
|
834
|
+
|
|
835
|
+
{"name": str, "score": int, "max_score": int, "reasoning": str}
|
|
836
|
+
|
|
837
|
+
to the client-expected marker format::
|
|
838
|
+
|
|
839
|
+
{"criteria": str, "score": float, "score_out_of": float, "description"?: str}
|
|
840
|
+
|
|
841
|
+
Each criterion's score is normalised to a 0.0–1.0 float using its own
|
|
842
|
+
``max_score``.
|
|
843
|
+
"""
|
|
844
|
+
marker_criteria = []
|
|
845
|
+
for c in criteria:
|
|
846
|
+
name = c.get("name", "")
|
|
847
|
+
cscore = c.get("score", 0)
|
|
848
|
+
cmax = c.get("max_score", 0)
|
|
849
|
+
|
|
850
|
+
# Normalise per-criterion score to 0.0–1.0
|
|
851
|
+
if cmax and float(cmax) > 0:
|
|
852
|
+
norm_score = float(cscore) / float(cmax)
|
|
853
|
+
else:
|
|
854
|
+
norm_score = float(cscore)
|
|
855
|
+
|
|
856
|
+
entry: dict = {
|
|
857
|
+
"criteria": name,
|
|
858
|
+
"score": round(norm_score, 4),
|
|
859
|
+
"score_out_of": 1.0,
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
reasoning = c.get("reasoning", "")
|
|
863
|
+
if reasoning:
|
|
864
|
+
entry["description"] = reasoning
|
|
865
|
+
|
|
866
|
+
marker_criteria.append(entry)
|
|
867
|
+
|
|
868
|
+
if marker_criteria:
|
|
869
|
+
print(">>> CRITERIA >>>")
|
|
870
|
+
print(json.dumps(marker_criteria))
|
|
871
|
+
print("<<< CRITERIA <<<")
|
|
872
|
+
|
|
873
|
+
|
|
569
874
|
def _print_judge_result(data: dict) -> None:
|
|
570
875
|
"""Print detailed judge grading result for verifier stdout capture."""
|
|
571
876
|
model = data.get("model_used", "unknown")
|
|
@@ -591,6 +896,12 @@ def _print_judge_result(data: dict) -> None:
|
|
|
591
896
|
if len(reasoning) > 200:
|
|
592
897
|
reasoning = reasoning[:200] + "..."
|
|
593
898
|
print(f"[C] {name}: {cscore}/{cmax} — {reasoning}")
|
|
899
|
+
|
|
900
|
+
# Emit structured criteria via stdout markers so the orchestrator
|
|
901
|
+
# (_extract_criteria_from_stdout) and client can render a rubric
|
|
902
|
+
# breakdown. Schema per element:
|
|
903
|
+
# {"criteria": str, "score": float, "score_out_of": float, "description"?: str}
|
|
904
|
+
_print_criteria_markers(criteria)
|
|
594
905
|
else:
|
|
595
906
|
print(f"[C] Score: {normalized:.2f}")
|
|
596
907
|
|
|
@@ -605,6 +916,26 @@ def _print_judge_result(data: dict) -> None:
|
|
|
605
916
|
for url in golden_urls:
|
|
606
917
|
print(f"[C] Gold reference: {url}")
|
|
607
918
|
|
|
919
|
+
# Print agentic judge steps if present
|
|
920
|
+
agent_steps = (data.get("accumulators") or {}).get("agent_steps")
|
|
921
|
+
if agent_steps:
|
|
922
|
+
print(f"[C] Agentic judge: {len(agent_steps)} steps")
|
|
923
|
+
for step in agent_steps:
|
|
924
|
+
stype = step.get("type", "?")
|
|
925
|
+
if stype == "mcp_connect":
|
|
926
|
+
print(f"[C] MCP connected ({step.get('tools_available', '?')} tools)")
|
|
927
|
+
elif stype == "tool_call":
|
|
928
|
+
tool = step.get("tool", "?")
|
|
929
|
+
turn = step.get("turn", "?")
|
|
930
|
+
is_err = step.get("is_error", False)
|
|
931
|
+
result_preview = step.get("result", "")[:100]
|
|
932
|
+
status = "ERROR" if is_err else "ok"
|
|
933
|
+
print(f"[C] Turn {turn}: {tool}() → {status}: {result_preview}")
|
|
934
|
+
elif stype == "final_response":
|
|
935
|
+
print(f"[C] Turn {step.get('turn', '?')}: final response")
|
|
936
|
+
elif stype == "max_turns_reached":
|
|
937
|
+
print(f"[C] Max turns reached ({step.get('turns_used', '?')})")
|
|
938
|
+
|
|
608
939
|
|
|
609
940
|
# ---------------------------------------------------------------------------
|
|
610
941
|
# Sync judge
|
|
@@ -632,6 +963,7 @@ class SyncJudge:
|
|
|
632
963
|
reference_claims: Optional[str] = None,
|
|
633
964
|
conversation: Optional[List[dict]] = None,
|
|
634
965
|
images: Optional[Dict[str, Image]] = None,
|
|
966
|
+
files: Optional[Dict[str, File]] = None,
|
|
635
967
|
model: Optional[str] = None,
|
|
636
968
|
provider: Optional[str] = None,
|
|
637
969
|
agentic: bool = False,
|
|
@@ -651,7 +983,8 @@ class SyncJudge:
|
|
|
651
983
|
context: Additional context for the judge.
|
|
652
984
|
reference_claims: Reference analysis claims (folded into context).
|
|
653
985
|
conversation: Conversation history as list of message dicts.
|
|
654
|
-
images:
|
|
986
|
+
images: Named Image objects for the judge.
|
|
987
|
+
files: Named File objects for the judge (PDF, CSV, STEP, etc.).
|
|
655
988
|
model: Override LLM model (server picks default if None).
|
|
656
989
|
provider: Override LLM provider (server picks default if None).
|
|
657
990
|
agentic: If True, the orchestrator collects artifacts from the instance.
|
|
@@ -668,6 +1001,7 @@ class SyncJudge:
|
|
|
668
1001
|
reference_claims=reference_claims,
|
|
669
1002
|
conversation=conversation,
|
|
670
1003
|
images=images,
|
|
1004
|
+
files=files,
|
|
671
1005
|
model=model,
|
|
672
1006
|
provider=provider,
|
|
673
1007
|
agentic=agentic,
|
|
@@ -675,6 +1009,6 @@ class SyncJudge:
|
|
|
675
1009
|
task_id=task_id,
|
|
676
1010
|
)
|
|
677
1011
|
|
|
678
|
-
_print_judge_call_start(rubric, images, agentic, model)
|
|
1012
|
+
_print_judge_call_start(rubric, images, agentic, model, files=files)
|
|
679
1013
|
response = self._client.request("POST", "/v1/judge/grade", json=body)
|
|
680
1014
|
return _parse_grade_response(response.json())
|
|
@@ -117,6 +117,7 @@ tests/test_app_method.py
|
|
|
117
117
|
tests/test_expect_exactly.py
|
|
118
118
|
tests/test_expect_only.py
|
|
119
119
|
tests/test_instance_dispatch.py
|
|
120
|
+
tests/test_judge_criteria_markers.py
|
|
120
121
|
tests/test_sqlite_resource_dual_mode.py
|
|
121
122
|
tests/test_sqlite_shared_memory_behavior.py
|
|
122
123
|
tests/test_verifier_from_string.py
|