fleet-python 0.2.112__tar.gz → 0.2.114__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.112/fleet_python.egg-info → fleet_python-0.2.114}/PKG-INFO +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/__init__.py +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/judge.py +28 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/base.py +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/judge.py +159 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/pyproject.toml +1 -1
- {fleet_python-0.2.112 → fleet_python-0.2.114}/LICENSE +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/README.md +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/diff_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_account.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_sync.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_task.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/export_tasks_filtered.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/openai_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/quickstart.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/api.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/filesystem.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/agent.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/mcp/main.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/orchestrator.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/types.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/cli.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/config.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/env/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/global_client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/models.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/api.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/filesystem.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/tasks.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/types.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet_python.egg-info/SOURCES.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/scripts/unasync.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/setup.cfg +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/__init__.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_expect_exactly.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.112 → fleet_python-0.2.114}/tests/test_verifier_from_string.py +0 -0
|
@@ -12,7 +12,10 @@ from ..judge import (
|
|
|
12
12
|
JudgeResult,
|
|
13
13
|
Rubric,
|
|
14
14
|
_build_grade_request,
|
|
15
|
+
_collect_image_from_env_async,
|
|
16
|
+
_guess_media_type,
|
|
15
17
|
_parse_grade_response,
|
|
18
|
+
_print_judge_call_start,
|
|
16
19
|
)
|
|
17
20
|
|
|
18
21
|
if TYPE_CHECKING:
|
|
@@ -75,6 +78,29 @@ class AsyncJudge:
|
|
|
75
78
|
collect: File patterns for orchestrator to collect (agentic mode).
|
|
76
79
|
task_id: Optional task ID for tracking.
|
|
77
80
|
"""
|
|
81
|
+
# Resolve Image.from_env images asynchronously before building request
|
|
82
|
+
resolved_images = images
|
|
83
|
+
if images and not agentic:
|
|
84
|
+
resolved_images = {}
|
|
85
|
+
for label, img in images.items():
|
|
86
|
+
if img.source == "env" and img._env is not None:
|
|
87
|
+
b64 = await _collect_image_from_env_async(img._env, img.filename)
|
|
88
|
+
if b64 is not None:
|
|
89
|
+
resolved_images[label] = Image.from_base64(
|
|
90
|
+
b64,
|
|
91
|
+
img.filename or "image.png",
|
|
92
|
+
_guess_media_type(img.filename or "image.png"),
|
|
93
|
+
)
|
|
94
|
+
else:
|
|
95
|
+
# Async collection failed — use collect source directly
|
|
96
|
+
# (don't keep the env image or serialize() will retry sync)
|
|
97
|
+
resolved_images[label] = Image(
|
|
98
|
+
source="collect",
|
|
99
|
+
filename=img.filename,
|
|
100
|
+
)
|
|
101
|
+
else:
|
|
102
|
+
resolved_images[label] = img
|
|
103
|
+
|
|
78
104
|
body = _build_grade_request(
|
|
79
105
|
self._instance_id,
|
|
80
106
|
rubric,
|
|
@@ -84,7 +110,7 @@ class AsyncJudge:
|
|
|
84
110
|
context=context,
|
|
85
111
|
reference_claims=reference_claims,
|
|
86
112
|
conversation=conversation,
|
|
87
|
-
images=
|
|
113
|
+
images=resolved_images,
|
|
88
114
|
model=model,
|
|
89
115
|
provider=provider,
|
|
90
116
|
agentic=agentic,
|
|
@@ -92,5 +118,6 @@ class AsyncJudge:
|
|
|
92
118
|
task_id=task_id,
|
|
93
119
|
)
|
|
94
120
|
|
|
121
|
+
_print_judge_call_start(rubric, resolved_images, agentic, model)
|
|
95
122
|
response = await self._client.request("POST", "/v1/judge/grade", json=body)
|
|
96
123
|
return _parse_grade_response(response.json())
|
|
@@ -176,6 +176,8 @@ class Image:
|
|
|
176
176
|
"data": self.data,
|
|
177
177
|
"media_type": self.media_type or _guess_media_type(self.filename or "image.png"),
|
|
178
178
|
}
|
|
179
|
+
elif self.source == "collect":
|
|
180
|
+
d = {"source": "collect", "selector": self.filename}
|
|
179
181
|
elif self.source == "env":
|
|
180
182
|
if agentic:
|
|
181
183
|
d = {"source": "collect", "selector": self.filename}
|
|
@@ -330,6 +332,86 @@ def _collect_image_from_env(env: Any, filename: str) -> Optional[str]:
|
|
|
330
332
|
return None
|
|
331
333
|
|
|
332
334
|
|
|
335
|
+
async def _collect_image_from_env_async(env: Any, filename: str) -> Optional[str]:
    """Async version of _collect_image_from_env.

    Tries three strategies in order and returns the first hit:

    1. A row of the ``files`` table in ``env.db("current")`` whose ``path``
       equals ``filename`` or ends with ``/filename``. Paths that do not start
       with ``dataroom/`` are preferred, and the shortest path wins.
    2. An ``image/png`` output embedded in a ``notebooks/*.ipynb`` row of the
       same table; cells are scanned from the last one backwards.
    3. A file on the local filesystem: ``filename`` itself, then
       ``/app/workspace/<filename>`` and ``/workspace/<filename>``.

    Every strategy is best-effort: a failure is logged at debug level and the
    next strategy is tried.

    Args:
        env: Object exposing ``db("current")``, whose result has an awaitable
            ``query(sql)`` method.
        filename: Name (or path) of the image to look for.

    Returns:
        Base64-encoded image data, or None if no strategy found the image.
    """
    # Strategy 1: DB files table
    try:
        # The name is embedded in a SQL string literal, so double any single
        # quote; otherwise a name such as "it's.png" breaks (or injects into)
        # the statement.
        quoted = str(filename).replace("'", "''")
        where = f"path = '{quoted}' OR path LIKE '%/{quoted}'"
        current = env.db("current")
        rows = _extract_query_rows(
            await current.query(f"SELECT path, hex(content) AS content_hex FROM files WHERE {where}")
        )
        candidates = {}
        for row in rows:
            path, chex = row.get("path", ""), row.get("content_hex", "")
            if not (path and chex):
                continue
            try:
                candidates[path] = bytes.fromhex(chex)
            except (TypeError, ValueError) as e:
                logger.debug("Skipping undecodable DB image row (async): %s", e)
        # Prefer non-dataroom paths
        non_dr = [p for p in candidates if not p.startswith("dataroom/")]
        pool = non_dr or list(candidates)
        if pool:
            # min() returns the first shortest path, like a stable sort by length
            best = min(pool, key=len)
            logger.debug("Loaded image from DB (async): %s", best)
            return base64.b64encode(candidates[best]).decode()
    except Exception as e:
        logger.debug("DB image query failed (async): %s", e)

    # Strategy 2: Notebook cell outputs
    try:
        current = env.db("current")
        nb_rows = _extract_query_rows(
            await current.query(
                "SELECT path, hex(content) AS content_hex FROM files "
                "WHERE path LIKE 'notebooks/%.ipynb'"
            )
        )
        for nb_row in nb_rows:
            chex = nb_row.get("content_hex", "")
            if not chex:
                continue
            try:
                nb = json.loads(bytes.fromhex(chex).decode("utf-8"))
                # Walk cells from the end so the most recent output wins
                for cell in reversed(nb.get("cells", [])):
                    for output in cell.get("outputs", []):
                        if output.get("output_type") not in ("display_data", "execute_result"):
                            continue
                        img_data = output.get("data", {}).get("image/png")
                        if not img_data:
                            continue
                        # The payload may be stored as a list of string chunks
                        if isinstance(img_data, list):
                            img_data = "".join(img_data)
                        img_data = img_data.strip()
                        if img_data:
                            logger.debug("Loaded image from notebook (async): %s", nb_row.get("path"))
                            return img_data
            except Exception as e:
                # A malformed notebook must not stop the scan of the others
                logger.debug("Skipping unreadable notebook (async): %s", e)
    except Exception as e:
        logger.debug("Notebook image query failed (async): %s", e)

    # Strategy 3: Filesystem fallback
    # NOTE: open()/read() below are synchronous calls made from a coroutine.
    search_paths = [
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    ]
    for fp in search_paths:
        try:
            if not os.path.exists(fp):
                continue
            with open(fp, "rb") as f:
                payload = f.read()
        except (OSError, TypeError, ValueError) as e:
            logger.debug("Filesystem image read failed (async): %s", e)
            continue
        logger.debug("Loaded image from filesystem (async): %s", fp)
        return base64.b64encode(payload).decode()

    return None
|
|
413
|
+
|
|
414
|
+
|
|
333
415
|
# ---------------------------------------------------------------------------
|
|
334
416
|
# Accumulator printing (verifier protocol)
|
|
335
417
|
# ---------------------------------------------------------------------------
|
|
@@ -359,6 +441,12 @@ def _print_accumulators(data: dict) -> None:
|
|
|
359
441
|
print(json.dumps(grading_details))
|
|
360
442
|
print("<<< GRADING_DETAILS <<<")
|
|
361
443
|
|
|
444
|
+
golden_urls = acc.get("golden_urls")
|
|
445
|
+
if golden_urls:
|
|
446
|
+
print(">>> GOLDEN_URLS >>>")
|
|
447
|
+
print(json.dumps(golden_urls))
|
|
448
|
+
print("<<< GOLDEN_URLS <<<")
|
|
449
|
+
|
|
362
450
|
timing = acc.get("timing")
|
|
363
451
|
if timing:
|
|
364
452
|
print(
|
|
@@ -373,6 +461,34 @@ def _print_accumulators(data: dict) -> None:
|
|
|
373
461
|
# ---------------------------------------------------------------------------
|
|
374
462
|
|
|
375
463
|
|
|
464
|
+
def _print_judge_call_start(
    rubric: Union[str, "Rubric"],
    images: Optional[Dict[str, "Image"]],
    agentic: bool,
    model: Optional[str],
) -> None:
    """Announce on stdout that a judge grading call is about to be made.

    Prints the mode and model, a one-line rubric summary when ``rubric`` is a
    ``Rubric`` instance, and one line per image (or a note that there are
    none).

    Args:
        rubric: Rubric text or a ``Rubric`` object; only the latter is summarised.
        images: Mapping of label to image, or None.
        agentic: Whether the call runs in agentic mode.
        model: Model name; "default" is printed when it is falsy.
    """
    if agentic:
        mode_label = "agentic"
    else:
        mode_label = "standard"
    shown_model = model if model else "default"
    print(f"[C] Calling judge ({mode_label} mode, model={shown_model})")

    if isinstance(rubric, Rubric):
        joined_names = ", ".join([criterion.name for criterion in rubric.criteria])
        count = len(rubric.criteria)
        print(f"[C] Rubric: {count} criteria ({joined_names}), max={rubric.max_score}")

    if not images:
        print("[C] No images provided")
        return

    for label, image in images.items():
        origin = image.source
        # A URL takes precedence over a filename when both are set
        if image.url:
            suffix = f" url={image.url}"
        elif image.filename:
            suffix = f" file={image.filename}"
        else:
            suffix = ""
        print(f"[C] Image '{label}': source={origin}{suffix}")
|
|
490
|
+
|
|
491
|
+
|
|
376
492
|
def _build_grade_request(
|
|
377
493
|
instance_id: str,
|
|
378
494
|
rubric: Union[str, Rubric],
|
|
@@ -443,11 +559,53 @@ def _build_grade_request(
|
|
|
443
559
|
|
|
444
560
|
def _parse_grade_response(data: dict) -> JudgeResult:
    """Turn the orchestrator's grade response into a JudgeResult.

    Before building the result, the response is echoed to stdout: first the
    detailed judge output, then the accumulators.

    Args:
        data: Decoded JSON body of the grade response.

    Returns:
        A JudgeResult carrying ``normalized_score`` as a float (0.0 when the
        key is absent), with the full response attached as ``details``.
    """
    # Detailed grading info goes out first, accumulators second
    _print_judge_result(data)
    _print_accumulators(data)
    normalized = data.get("normalized_score", 0.0)
    return JudgeResult(float(normalized), details=data)
|
|
449
567
|
|
|
450
568
|
|
|
569
|
+
def _clip_for_stdout(value, limit):
    """Return ``value`` as text, cut to ``limit`` characters plus "..." if longer."""
    text = "" if value is None else str(value)
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _number_or(value, fallback):
    """Return ``value`` as a float, or ``fallback`` when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return fallback


def _print_judge_result(data: dict) -> None:
    """Print detailed judge grading result for verifier stdout capture.

    Emits, in order: the model and provider used, the elapsed time (when the
    accumulators carry a numeric ``elapsed_ms``), the score with a
    per-criterion breakdown (or only the normalized score when there are no
    criteria), the feedback, and any golden reference URLs.

    This is a logging helper, so null or oddly typed fields in the response
    are rendered defensively instead of raising.

    Args:
        data: Decoded JSON body of the grade response.
    """
    model = data.get("model_used", "unknown")
    provider = data.get("provider_used", "unknown")
    total = data.get("total_score", 0)
    max_score = data.get("max_score", 0)
    # A JSON null (or non-numeric value) would otherwise crash the :.2f format
    normalized = _number_or(data.get("normalized_score", 0), 0.0)
    accumulators = data.get("accumulators") or {}
    elapsed = _number_or(accumulators.get("elapsed_ms"), None)

    print(f"[C] Grading via {model} (provider={provider})")
    if elapsed is not None:
        print(f"[C] Judge call completed in {elapsed:.0f}ms")

    criteria = data.get("criteria")
    if criteria:
        print(f"[C] Score: {total}/{max_score} ({normalized:.2f})")
        for c in criteria:
            name = c.get("name", "?")
            cscore = c.get("score", "?")
            cmax = c.get("max_score", "?")
            # Truncate long reasoning for stdout readability; a null
            # reasoning is shown as an empty string
            reasoning = _clip_for_stdout(c.get("reasoning", ""), 200)
            print(f"[C] {name}: {cscore}/{cmax} — {reasoning}")
    else:
        print(f"[C] Score: {normalized:.2f}")

    feedback = data.get("feedback")
    if feedback:
        print(f"[C] Feedback: {_clip_for_stdout(feedback, 300)}")

    # Print golden URLs if present in accumulators
    golden_urls = accumulators.get("golden_urls")
    if golden_urls:
        # A bare string is one URL, not a sequence of characters
        if isinstance(golden_urls, str):
            golden_urls = [golden_urls]
        for url in golden_urls:
            print(f"[C] Gold reference: {url}")
|
|
607
|
+
|
|
608
|
+
|
|
451
609
|
# ---------------------------------------------------------------------------
|
|
452
610
|
# Sync judge
|
|
453
611
|
# ---------------------------------------------------------------------------
|
|
@@ -517,5 +675,6 @@ class SyncJudge:
|
|
|
517
675
|
task_id=task_id,
|
|
518
676
|
)
|
|
519
677
|
|
|
678
|
+
_print_judge_call_start(rubric, images, agentic, model)
|
|
520
679
|
response = self._client.request("POST", "/v1/judge/grade", json=body)
|
|
521
680
|
return _parse_grade_response(response.json())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|