fleet-python 0.2.113__tar.gz → 0.2.115__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.113/fleet_python.egg-info → fleet_python-0.2.115}/PKG-INFO +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/__init__.py +3 -2
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/judge.py +31 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/base.py +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/judge.py +358 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/pyproject.toml +1 -1
- {fleet_python-0.2.113 → fleet_python-0.2.115}/LICENSE +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/README.md +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/diff_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_account.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_sync.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_task.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/export_tasks_filtered.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/openai_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/quickstart.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/api.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/filesystem.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/agent.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/mcp/main.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/orchestrator.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/types.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/cli.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/config.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/env/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/global_client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/models.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/api.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/filesystem.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/tasks.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/types.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet_python.egg-info/SOURCES.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/scripts/unasync.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/setup.cfg +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/__init__.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_expect_exactly.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.113 → fleet_python-0.2.115}/tests/test_verifier_from_string.py +0 -0
|
@@ -69,14 +69,14 @@ from .tasks import (
|
|
|
69
69
|
from .types import VerifierFunction
|
|
70
70
|
|
|
71
71
|
# Import judge data classes
|
|
72
|
-
from .judge import Rubric, Criterion, Image, JudgeResult
|
|
72
|
+
from .judge import Rubric, Criterion, File, Image, JudgeResult
|
|
73
73
|
|
|
74
74
|
# Create a module-level env attribute for convenient access
|
|
75
75
|
from . import env
|
|
76
76
|
from . import global_client as _global_client
|
|
77
77
|
from ._async import global_client as _async_global_client
|
|
78
78
|
|
|
79
|
-
__version__ = "0.2.113"
|
|
79
|
+
__version__ = "0.2.115"
|
|
80
80
|
|
|
81
81
|
__all__ = [
|
|
82
82
|
# Core classes
|
|
@@ -96,6 +96,7 @@ __all__ = [
|
|
|
96
96
|
# Judge
|
|
97
97
|
"Rubric",
|
|
98
98
|
"Criterion",
|
|
99
|
+
"File",
|
|
99
100
|
"Image",
|
|
100
101
|
"JudgeResult",
|
|
101
102
|
# Exceptions
|
|
@@ -8,13 +8,17 @@ from typing import Dict, List, Optional, Union, TYPE_CHECKING
|
|
|
8
8
|
# Import shared classes and helpers from the sync module
|
|
9
9
|
from ..judge import (
|
|
10
10
|
Criterion,
|
|
11
|
+
File,
|
|
11
12
|
Image,
|
|
12
13
|
JudgeResult,
|
|
13
14
|
Rubric,
|
|
14
15
|
_build_grade_request,
|
|
16
|
+
_collect_file_from_env_async,
|
|
15
17
|
_collect_image_from_env_async,
|
|
18
|
+
_guess_file_media_type,
|
|
16
19
|
_guess_media_type,
|
|
17
20
|
_parse_grade_response,
|
|
21
|
+
_print_judge_call_start,
|
|
18
22
|
)
|
|
19
23
|
|
|
20
24
|
if TYPE_CHECKING:
|
|
@@ -24,6 +28,7 @@ if TYPE_CHECKING:
|
|
|
24
28
|
__all__ = [
|
|
25
29
|
"AsyncJudge",
|
|
26
30
|
"Criterion",
|
|
31
|
+
"File",
|
|
27
32
|
"Image",
|
|
28
33
|
"JudgeResult",
|
|
29
34
|
"Rubric",
|
|
@@ -51,6 +56,7 @@ class AsyncJudge:
|
|
|
51
56
|
reference_claims: Optional[str] = None,
|
|
52
57
|
conversation: Optional[List[dict]] = None,
|
|
53
58
|
images: Optional[Dict[str, Image]] = None,
|
|
59
|
+
files: Optional[Dict[str, File]] = None,
|
|
54
60
|
model: Optional[str] = None,
|
|
55
61
|
provider: Optional[str] = None,
|
|
56
62
|
agentic: bool = False,
|
|
@@ -71,6 +77,7 @@ class AsyncJudge:
|
|
|
71
77
|
reference_claims: Reference analysis claims.
|
|
72
78
|
conversation: Conversation history as list of message dicts.
|
|
73
79
|
images: Named images for the judge (e.g., gold reference, agent output).
|
|
80
|
+
files: Named files for the judge (PDF, CSV, STEP, etc.).
|
|
74
81
|
model: Override LLM model (server picks default if None).
|
|
75
82
|
provider: Override LLM provider (server picks default if None).
|
|
76
83
|
agentic: If True, the orchestrator collects artifacts from the instance.
|
|
@@ -100,6 +107,28 @@ class AsyncJudge:
|
|
|
100
107
|
else:
|
|
101
108
|
resolved_images[label] = img
|
|
102
109
|
|
|
110
|
+
# Resolve File.from_env files asynchronously before building request
|
|
111
|
+
resolved_files = files
|
|
112
|
+
if files and not agentic:
|
|
113
|
+
resolved_files = {}
|
|
114
|
+
for label, f in files.items():
|
|
115
|
+
if f.source == "env" and f._env is not None:
|
|
116
|
+
b64 = await _collect_file_from_env_async(f._env, f.filename)
|
|
117
|
+
if b64 is not None:
|
|
118
|
+
resolved_files[label] = File.from_base64(
|
|
119
|
+
b64,
|
|
120
|
+
f.filename or "file",
|
|
121
|
+
_guess_file_media_type(f.filename or "file"),
|
|
122
|
+
)
|
|
123
|
+
else:
|
|
124
|
+
# Async collection failed — use collect source directly
|
|
125
|
+
resolved_files[label] = File(
|
|
126
|
+
source="collect",
|
|
127
|
+
filename=f.filename,
|
|
128
|
+
)
|
|
129
|
+
else:
|
|
130
|
+
resolved_files[label] = f
|
|
131
|
+
|
|
103
132
|
body = _build_grade_request(
|
|
104
133
|
self._instance_id,
|
|
105
134
|
rubric,
|
|
@@ -110,6 +139,7 @@ class AsyncJudge:
|
|
|
110
139
|
reference_claims=reference_claims,
|
|
111
140
|
conversation=conversation,
|
|
112
141
|
images=resolved_images,
|
|
142
|
+
files=resolved_files,
|
|
113
143
|
model=model,
|
|
114
144
|
provider=provider,
|
|
115
145
|
agentic=agentic,
|
|
@@ -117,5 +147,6 @@ class AsyncJudge:
|
|
|
117
147
|
task_id=task_id,
|
|
118
148
|
)
|
|
119
149
|
|
|
150
|
+
_print_judge_call_start(rubric, resolved_images, agentic, model, files=resolved_files)
|
|
120
151
|
response = await self._client.request("POST", "/v1/judge/grade", json=body)
|
|
121
152
|
return _parse_grade_response(response.json())
|
|
@@ -38,6 +38,47 @@ def _guess_media_type(filename: str) -> str:
|
|
|
38
38
|
}.get(ext, "image/png")
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
def _guess_file_media_type(filename: str) -> str:
    """Return the MIME type implied by the extension of ``filename``.

    The extension is whatever follows the last ``.`` in the name, compared
    case-insensitively. The table is broader than the image-only one used by
    ``_guess_media_type``: it also covers documents, data formats, CAD models
    and archives. Names without a dot, or with an extension missing from the
    table, map to ``"application/octet-stream"``.
    """
    _, dot, suffix = filename.lower().rpartition(".")
    if not dot:
        # No "." anywhere in the name, so there is no extension to look up.
        suffix = ""

    media_types = {
        # Images
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "webp": "image/webp",
        "svg": "image/svg+xml",
        # Documents
        "pdf": "application/pdf",
        "txt": "text/plain",
        "md": "text/markdown",
        "html": "text/html",
        "htm": "text/html",
        "csv": "text/csv",
        "tsv": "text/tab-separated-values",
        # Data
        "json": "application/json",
        "xml": "application/xml",
        "yaml": "application/x-yaml",
        "yml": "application/x-yaml",
        # CAD / Engineering
        "step": "application/step",
        "stp": "application/step",
        "stl": "model/stl",
        "iges": "model/iges",
        "igs": "model/iges",
        "obj": "model/obj",
        # Archives
        "zip": "application/zip",
        "gz": "application/gzip",
        "tar": "application/x-tar",
    }
    return media_types.get(suffix, "application/octet-stream")
|
|
80
|
+
|
|
81
|
+
|
|
41
82
|
@dataclass
|
|
42
83
|
class Criterion:
|
|
43
84
|
"""A single rubric criterion for grading.
|
|
@@ -199,6 +240,99 @@ class Image:
|
|
|
199
240
|
return d
|
|
200
241
|
|
|
201
242
|
|
|
243
|
+
class File:
    """Reference to an arbitrary file handed to the LLM judge.

    Any file type is accepted (PDF, CSV, STEP, STL, etc.); files are passed
    on via the Anthropic Files API. Build instances with the static
    constructors rather than calling ``File(...)`` directly:

        File.s3("s3://bucket/key")                    - S3 URL, fetched server-side
        File.from_base64(data, "part.step", "application/step") - inline base64 data
        File.from_env(env, "exported_part.step")      - collect from environment
    """

    def __init__(
        self,
        *,
        source: str,
        url: Optional[str] = None,
        data: Optional[str] = None,
        filename: Optional[str] = None,
        media_type: Optional[str] = None,
        _env: Optional[Any] = None,
    ):
        # ``source`` selects the serialization branch: "s3", "base64",
        # "collect" or "env".
        self.source = source
        self.url = url
        self.data = data
        self.filename = filename
        self.media_type = media_type
        # Environment handle, only used by source="env" files.
        self._env = _env

    @staticmethod
    def s3(url: str, media_type: Optional[str] = None) -> "File":
        """Point at an S3 object; the orchestrator fetches it server-side."""
        return File(source="s3", url=url, media_type=media_type)

    @staticmethod
    def from_base64(
        data: str, filename: str, media_type: Optional[str] = None
    ) -> "File":
        """Wrap inline base64 data, guessing the media type when not given."""
        resolved_type = media_type or _guess_file_media_type(filename)
        return File(
            source="base64",
            data=data,
            filename=filename,
            media_type=resolved_type,
        )

    @staticmethod
    def from_env(env: Any, filename: str) -> "File":
        """Reference a file that lives in the environment.

        Non-agentic mode: the SDK collects the file client-side
        (DB -> filesystem) and sends base64 to the orchestrator.

        Agentic mode: only the filename hint is sent and the orchestrator
        collects the file itself.
        """
        return File(source="env", filename=filename, _env=env)

    def serialize(self, *, label: Optional[str] = None, agentic: bool = False) -> dict:
        """Build this file's entry for the orchestrator API request body.

        Args:
            label: Optional name added to the entry under the ``"label"`` key.
            agentic: When True, ``source="env"`` files are sent as a collect
                hint instead of being collected client-side.

        Raises:
            ValueError: If ``source`` is not one of the known kinds.
        """
        payload = self._payload(agentic)
        if label is not None:
            payload["label"] = label
        return payload

    def _payload(self, agentic: bool) -> dict:
        """Return the source-specific part of the serialized entry."""
        if self.source == "s3":
            entry = {"source": "s3", "url": self.url}
            if self.media_type:
                entry["media_type"] = self.media_type
            return entry

        if self.source == "base64":
            return {
                "source": "base64",
                "data": self.data,
                "filename": self.filename,
                "media_type": self.media_type or _guess_file_media_type(self.filename or "file"),
            }

        if self.source == "collect":
            return {"source": "collect", "selector": self.filename}

        if self.source == "env":
            if agentic:
                return {"source": "collect", "selector": self.filename}
            encoded = _collect_file_from_env(self._env, self.filename)
            if encoded is None:
                # Client-side collection found nothing: let the orchestrator try.
                return {"source": "collect", "selector": self.filename}
            return {
                "source": "base64",
                "data": encoded,
                "filename": self.filename,
                "media_type": _guess_file_media_type(self.filename or "file"),
            }

        raise ValueError(f"Unknown file source: {self.source}")
|
|
334
|
+
|
|
335
|
+
|
|
202
336
|
class JudgeResult(float):
|
|
203
337
|
"""Float subclass that carries grading details.
|
|
204
338
|
|
|
@@ -412,6 +546,102 @@ async def _collect_image_from_env_async(env: Any, filename: str) -> Optional[str
|
|
|
412
546
|
return None
|
|
413
547
|
|
|
414
548
|
|
|
549
|
+
def _collect_file_from_env(env: Any, filename: str) -> Optional[str]:
    """Collect a file from the environment using DB -> filesystem strategies.

    Similar to _collect_image_from_env but skips the notebook cell output
    strategy (which is image-specific).

    Strategy 1 queries the ``files`` table of ``env.db("current")`` for rows
    whose path equals ``filename`` or ends with ``/filename``. Paths outside
    ``dataroom/`` are preferred, and among the remaining candidates the
    shortest path wins.

    Strategy 2 tries to read ``filename``, ``/app/workspace/<filename>`` and
    ``/workspace/<filename>`` from the local filesystem, in that order.

    Both strategies are best effort: failures are logged at debug level and
    never raised.

    Args:
        env: Environment object exposing ``db(name)``.
        filename: File name (or path) to look for.

    Returns:
        Base64-encoded file content, or None if the file was not found.
    """
    # Strategy 1: DB files table
    try:
        current = env.db("current")
        # The name is spliced into SQL text, so double any single quote to
        # keep it inside the string literal (e.g. "o'brien.csv").
        # NOTE(review): "%" and "_" in the name still act as LIKE wildcards.
        quoted = str(filename).replace("'", "''")
        where = f"path = '{quoted}' OR path LIKE '%/{quoted}'"
        result = current.query(
            f"SELECT path, hex(content) AS content_hex FROM files WHERE {where}"
        )
        candidates = {}
        for row in _extract_query_rows(result):
            path, chex = row.get("path", ""), row.get("content_hex", "")
            if not (path and chex):
                continue
            try:
                candidates[path] = bytes.fromhex(chex)
            except (TypeError, ValueError):
                # Undecodable hex payload: skip this row, keep the others.
                continue
        # Prefer non-dataroom paths, then the shortest path.
        preferred = [p for p in candidates if not p.startswith("dataroom/")]
        pool = preferred or list(candidates)
        if pool:
            chosen = min(pool, key=len)
            logger.debug("Loaded file from DB: %s", chosen)
            return base64.b64encode(candidates[chosen]).decode()
    except Exception as e:
        logger.debug("DB file query failed: %s", e)

    # Strategy 2: Filesystem fallback
    search_paths = (
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    )
    for fp in search_paths:
        try:
            if os.path.exists(fp):
                with open(fp, "rb") as f:
                    logger.debug("Loaded file from filesystem: %s", fp)
                    return base64.b64encode(f.read()).decode()
        except Exception as e:
            # Best effort: an unreadable candidate must not abort the search.
            logger.debug("Filesystem read failed for %s: %s", fp, e)

    return None
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
async def _collect_file_from_env_async(env: Any, filename: str) -> Optional[str]:
    """Async version of _collect_file_from_env.

    Collects a file from an AsyncEnv using DB -> filesystem strategies.

    Strategy 1 awaits a query on the ``files`` table of ``env.db("current")``
    for rows whose path equals ``filename`` or ends with ``/filename``. Paths
    outside ``dataroom/`` are preferred, and among the remaining candidates
    the shortest path wins.

    Strategy 2 tries to read ``filename``, ``/app/workspace/<filename>`` and
    ``/workspace/<filename>`` from the local filesystem, in that order. These
    reads use ordinary blocking file I/O.

    Both strategies are best effort: failures are logged at debug level and
    never raised.

    Args:
        env: Environment object exposing ``db(name)`` with an awaitable
            ``query``.
        filename: File name (or path) to look for.

    Returns:
        Base64-encoded file content, or None if the file was not found.
    """
    # Strategy 1: DB files table
    try:
        current = env.db("current")
        # The name is spliced into SQL text, so double any single quote to
        # keep it inside the string literal (e.g. "o'brien.csv").
        # NOTE(review): "%" and "_" in the name still act as LIKE wildcards.
        quoted = str(filename).replace("'", "''")
        where = f"path = '{quoted}' OR path LIKE '%/{quoted}'"
        result = await current.query(
            f"SELECT path, hex(content) AS content_hex FROM files WHERE {where}"
        )
        candidates = {}
        for row in _extract_query_rows(result):
            path, chex = row.get("path", ""), row.get("content_hex", "")
            if not (path and chex):
                continue
            try:
                candidates[path] = bytes.fromhex(chex)
            except (TypeError, ValueError):
                # Undecodable hex payload: skip this row, keep the others.
                continue
        # Prefer non-dataroom paths, then the shortest path.
        preferred = [p for p in candidates if not p.startswith("dataroom/")]
        pool = preferred or list(candidates)
        if pool:
            chosen = min(pool, key=len)
            logger.debug("Loaded file from DB (async): %s", chosen)
            return base64.b64encode(candidates[chosen]).decode()
    except Exception as e:
        logger.debug("DB file query failed (async): %s", e)

    # Strategy 2: Filesystem fallback
    search_paths = (
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    )
    for fp in search_paths:
        try:
            if os.path.exists(fp):
                with open(fp, "rb") as f:
                    logger.debug("Loaded file from filesystem (async): %s", fp)
                    return base64.b64encode(f.read()).decode()
        except Exception as e:
            # Best effort: an unreadable candidate must not abort the search.
            logger.debug("Filesystem read failed for %s (async): %s", fp, e)

    return None
|
|
643
|
+
|
|
644
|
+
|
|
415
645
|
# ---------------------------------------------------------------------------
|
|
416
646
|
# Accumulator printing (verifier protocol)
|
|
417
647
|
# ---------------------------------------------------------------------------
|
|
@@ -441,6 +671,18 @@ def _print_accumulators(data: dict) -> None:
|
|
|
441
671
|
print(json.dumps(grading_details))
|
|
442
672
|
print("<<< GRADING_DETAILS <<<")
|
|
443
673
|
|
|
674
|
+
golden_urls = acc.get("golden_urls")
|
|
675
|
+
if golden_urls:
|
|
676
|
+
print(">>> GOLDEN_URLS >>>")
|
|
677
|
+
print(json.dumps(golden_urls))
|
|
678
|
+
print("<<< GOLDEN_URLS <<<")
|
|
679
|
+
|
|
680
|
+
agent_steps = acc.get("agent_steps")
|
|
681
|
+
if agent_steps:
|
|
682
|
+
print(">>> AGENT_STEPS >>>")
|
|
683
|
+
print(json.dumps(agent_steps))
|
|
684
|
+
print("<<< AGENT_STEPS <<<")
|
|
685
|
+
|
|
444
686
|
timing = acc.get("timing")
|
|
445
687
|
if timing:
|
|
446
688
|
print(
|
|
@@ -455,6 +697,47 @@ def _print_accumulators(data: dict) -> None:
|
|
|
455
697
|
# ---------------------------------------------------------------------------
|
|
456
698
|
|
|
457
699
|
|
|
700
|
+
def _print_judge_call_start(
    rubric: Union[str, "Rubric"],
    images: Optional[Dict[str, "Image"]],
    agentic: bool,
    model: Optional[str],
    files: Optional[Dict[str, "File"]] = None,
) -> None:
    """Announce on stdout that a judge grading call is about to be made.

    Prints the mode and model, a rubric summary when ``rubric`` is a
    ``Rubric`` instance, then one line per image (or a "no images" line)
    and one line per file.

    Args:
        rubric: Rubric object or plain rubric text; text prints no summary.
        images: Named images sent to the judge, if any.
        agentic: Whether the call runs in agentic mode.
        model: Model override; shown as "default" when not set.
        files: Named files sent to the judge, if any.
    """
    mode = "standard"
    if agentic:
        mode = "agentic"
    print(f"[C] Calling judge ({mode} mode, model={model or 'default'})")

    if isinstance(rubric, Rubric):
        joined_names = ", ".join(criterion.name for criterion in rubric.criteria)
        print(f"[C] Rubric: {len(rubric.criteria)} criteria ({joined_names}), max={rubric.max_score}")

    if not images:
        print("[C] No images provided")
    else:
        for label, img in images.items():
            origin = img.source
            # A URL takes precedence over a filename in the printed detail.
            if img.url:
                extra = f" url={img.url}"
            elif img.filename:
                extra = f" file={img.filename}"
            else:
                extra = ""
            print(f"[C] Image '{label}': source={origin}{extra}")

    for label, f in (files or {}).items():
        origin = f.source
        if f.url:
            extra = f" url={f.url}"
        elif f.filename:
            extra = f" file={f.filename}"
        else:
            extra = ""
        if f.media_type:
            extra = f"{extra} type={f.media_type}"
        print(f"[C] File '{label}': source={origin}{extra}")
|
|
739
|
+
|
|
740
|
+
|
|
458
741
|
def _build_grade_request(
|
|
459
742
|
instance_id: str,
|
|
460
743
|
rubric: Union[str, Rubric],
|
|
@@ -466,6 +749,7 @@ def _build_grade_request(
|
|
|
466
749
|
reference_claims: Optional[str] = None,
|
|
467
750
|
conversation: Optional[List[dict]] = None,
|
|
468
751
|
images: Optional[Dict[str, Image]] = None,
|
|
752
|
+
files: Optional[Dict[str, "File"]] = None,
|
|
469
753
|
model: Optional[str] = None,
|
|
470
754
|
provider: Optional[str] = None,
|
|
471
755
|
agentic: bool = False,
|
|
@@ -520,16 +804,85 @@ def _build_grade_request(
|
|
|
520
804
|
for label, img in images.items()
|
|
521
805
|
]
|
|
522
806
|
|
|
807
|
+
# Serialize files as labeled array
|
|
808
|
+
if files:
|
|
809
|
+
body["files"] = [
|
|
810
|
+
f.serialize(label=label, agentic=agentic)
|
|
811
|
+
for label, f in files.items()
|
|
812
|
+
]
|
|
813
|
+
|
|
523
814
|
return body
|
|
524
815
|
|
|
525
816
|
|
|
526
817
|
def _parse_grade_response(data: dict) -> JudgeResult:
    """Convert the orchestrator's grade response into a ``JudgeResult``.

    The readable grading summary and the accumulator sections are printed
    first. The result's float value is the response's ``normalized_score``
    (0.0 when the key is absent) and the full response is attached as
    ``details``.
    """
    # Emit the readable summary before the accumulator sections.
    _print_judge_result(data)
    _print_accumulators(data)
    normalized = data.get("normalized_score", 0.0)
    return JudgeResult(float(normalized), details=data)
|
|
531
824
|
|
|
532
825
|
|
|
826
|
+
def _print_judge_result(data: dict) -> None:
    """Print detailed judge grading result for verifier stdout capture.

    Output, in order: the model/provider line, the elapsed time when the
    accumulators carry ``elapsed_ms``, the score (with one line per criterion
    when ``criteria`` is present), the feedback, any golden URLs, and the
    agentic judge steps.

    Criterion reasoning and tool-call results are coerced to text before
    truncation, so a null or non-string value in the response cannot raise
    and abort result parsing.

    Args:
        data: Decoded JSON body of the orchestrator's grade response.
    """
    model = data.get("model_used", "unknown")
    provider = data.get("provider_used", "unknown")
    total = data.get("total_score", 0)
    max_score = data.get("max_score", 0)
    normalized = data.get("normalized_score", 0)
    # "accumulators" may be missing or null; treat both as empty.
    accumulators = data.get("accumulators") or {}
    elapsed = accumulators.get("elapsed_ms")

    print(f"[C] Grading via {model} (provider={provider})")
    if elapsed is not None:
        print(f"[C] Judge call completed in {elapsed:.0f}ms")

    criteria = data.get("criteria")
    if criteria:
        print(f"[C] Score: {total}/{max_score} ({normalized:.2f})")
        for c in criteria:
            name = c.get("name", "?")
            cscore = c.get("score", "?")
            cmax = c.get("max_score", "?")
            raw_reasoning = c.get("reasoning", "")
            # JSON null arrives as None; len()/slicing need a string.
            reasoning = "" if raw_reasoning is None else str(raw_reasoning)
            # Truncate long reasoning for stdout readability
            if len(reasoning) > 200:
                reasoning = reasoning[:200] + "..."
            print(f"[C] {name}: {cscore}/{cmax} — {reasoning}")
    else:
        print(f"[C] Score: {normalized:.2f}")

    feedback = data.get("feedback")
    if feedback:
        fb_display = feedback if len(feedback) <= 300 else feedback[:300] + "..."
        print(f"[C] Feedback: {fb_display}")

    # Print golden URLs if present in accumulators
    golden_urls = accumulators.get("golden_urls")
    if golden_urls:
        for url in golden_urls:
            print(f"[C] Gold reference: {url}")

    # Print agentic judge steps if present
    agent_steps = accumulators.get("agent_steps")
    if agent_steps:
        print(f"[C] Agentic judge: {len(agent_steps)} steps")
        for step in agent_steps:
            stype = step.get("type", "?")
            if stype == "mcp_connect":
                tools_available = step.get("tools_available", "?")
                print(f"[C] MCP connected ({tools_available} tools)")
            elif stype == "tool_call":
                tool = step.get("tool", "?")
                turn = step.get("turn", "?")
                is_err = step.get("is_error", False)
                raw_result = step.get("result", "")
                # Results may be null or structured; stringify before slicing.
                result_preview = ("" if raw_result is None else str(raw_result))[:100]
                status = "ERROR" if is_err else "ok"
                print(f"[C] Turn {turn}: {tool}() → {status}: {result_preview}")
            elif stype == "final_response":
                turn = step.get("turn", "?")
                print(f"[C] Turn {turn}: final response")
            elif stype == "max_turns_reached":
                turns_used = step.get("turns_used", "?")
                print(f"[C] Max turns reached ({turns_used})")
|
|
884
|
+
|
|
885
|
+
|
|
533
886
|
# ---------------------------------------------------------------------------
|
|
534
887
|
# Sync judge
|
|
535
888
|
# ---------------------------------------------------------------------------
|
|
@@ -556,6 +909,7 @@ class SyncJudge:
|
|
|
556
909
|
reference_claims: Optional[str] = None,
|
|
557
910
|
conversation: Optional[List[dict]] = None,
|
|
558
911
|
images: Optional[Dict[str, Image]] = None,
|
|
912
|
+
files: Optional[Dict[str, File]] = None,
|
|
559
913
|
model: Optional[str] = None,
|
|
560
914
|
provider: Optional[str] = None,
|
|
561
915
|
agentic: bool = False,
|
|
@@ -575,7 +929,8 @@ class SyncJudge:
|
|
|
575
929
|
context: Additional context for the judge.
|
|
576
930
|
reference_claims: Reference analysis claims (folded into context).
|
|
577
931
|
conversation: Conversation history as list of message dicts.
|
|
578
|
-
images:
|
|
932
|
+
images: Named Image objects for the judge.
|
|
933
|
+
files: Named File objects for the judge (PDF, CSV, STEP, etc.).
|
|
579
934
|
model: Override LLM model (server picks default if None).
|
|
580
935
|
provider: Override LLM provider (server picks default if None).
|
|
581
936
|
agentic: If True, the orchestrator collects artifacts from the instance.
|
|
@@ -592,6 +947,7 @@ class SyncJudge:
|
|
|
592
947
|
reference_claims=reference_claims,
|
|
593
948
|
conversation=conversation,
|
|
594
949
|
images=images,
|
|
950
|
+
files=files,
|
|
595
951
|
model=model,
|
|
596
952
|
provider=provider,
|
|
597
953
|
agentic=agentic,
|
|
@@ -599,5 +955,6 @@ class SyncJudge:
|
|
|
599
955
|
task_id=task_id,
|
|
600
956
|
)
|
|
601
957
|
|
|
958
|
+
_print_judge_call_start(rubric, images, agentic, model, files=files)
|
|
602
959
|
response = self._client.request("POST", "/v1/judge/grade", json=body)
|
|
603
960
|
return _parse_grade_response(response.json())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|