fleet-python 0.2.110__tar.gz → 0.2.112__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {fleet_python-0.2.110/fleet_python.egg-info → fleet_python-0.2.112}/PKG-INFO +1 -1
  2. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/__init__.py +9 -1
  3. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/__init__.py +1 -1
  4. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/base.py +1 -1
  5. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/client.py +14 -0
  6. fleet_python-0.2.112/fleet/_async/judge.py +96 -0
  7. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/filesystem.py +8 -0
  8. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/base.py +1 -1
  9. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/client.py +14 -0
  10. fleet_python-0.2.112/fleet/judge.py +521 -0
  11. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/filesystem.py +8 -0
  12. {fleet_python-0.2.110 → fleet_python-0.2.112/fleet_python.egg-info}/PKG-INFO +1 -1
  13. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/SOURCES.txt +2 -0
  14. {fleet_python-0.2.110 → fleet_python-0.2.112}/pyproject.toml +1 -1
  15. {fleet_python-0.2.110 → fleet_python-0.2.112}/LICENSE +0 -0
  16. {fleet_python-0.2.110 → fleet_python-0.2.112}/README.md +0 -0
  17. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/diff_example.py +0 -0
  18. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/dsl_example.py +0 -0
  19. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example.py +0 -0
  20. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/exampleResume.py +0 -0
  21. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_account.py +0 -0
  22. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_action_log.py +0 -0
  23. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_client.py +0 -0
  24. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_mcp_anthropic.py +0 -0
  25. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_mcp_openai.py +0 -0
  26. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_sync.py +0 -0
  27. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_task.py +0 -0
  28. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_tasks.py +0 -0
  29. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/example_verifier.py +0 -0
  30. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/export_tasks.py +0 -0
  31. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/export_tasks_filtered.py +0 -0
  32. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/fetch_tasks.py +0 -0
  33. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/gemini_example.py +0 -0
  34. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/import_tasks.py +0 -0
  35. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/iterate_verifiers.py +0 -0
  36. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/json_tasks_example.py +0 -0
  37. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/nova_act_example.py +0 -0
  38. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/openai_example.py +0 -0
  39. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/openai_simple_example.py +0 -0
  40. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/query_builder_example.py +0 -0
  41. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/quickstart.py +0 -0
  42. {fleet_python-0.2.110 → fleet_python-0.2.112}/examples/test_cdp_logging.py +0 -0
  43. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/env/__init__.py +0 -0
  44. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/env/client.py +0 -0
  45. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/exceptions.py +0 -0
  46. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/global_client.py +0 -0
  47. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/__init__.py +0 -0
  48. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/base.py +0 -0
  49. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/instance/client.py +0 -0
  50. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/models.py +0 -0
  51. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/__init__.py +0 -0
  52. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/api.py +0 -0
  53. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/base.py +0 -0
  54. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/browser.py +0 -0
  55. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/mcp.py +0 -0
  56. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/resources/sqlite.py +0 -0
  57. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/tasks.py +0 -0
  58. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/__init__.py +0 -0
  59. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/bundler.py +0 -0
  60. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/_async/verifiers/verifier.py +0 -0
  61. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/__init__.py +0 -0
  62. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/Dockerfile +0 -0
  63. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/__init__.py +0 -0
  64. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/agent.py +0 -0
  65. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp/main.py +0 -0
  66. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
  67. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
  68. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
  69. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/requirements.txt +0 -0
  70. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/gemini_cua/start.sh +0 -0
  71. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/orchestrator.py +0 -0
  72. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/types.py +0 -0
  73. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/agent/utils.py +0 -0
  74. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/cli.py +0 -0
  75. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/config.py +0 -0
  76. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/env/__init__.py +0 -0
  77. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/env/client.py +0 -0
  78. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/eval/__init__.py +0 -0
  79. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/eval/uploader.py +0 -0
  80. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/exceptions.py +0 -0
  81. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/global_client.py +0 -0
  82. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/__init__.py +0 -0
  83. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/base.py +0 -0
  84. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/client.py +0 -0
  85. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/instance/models.py +0 -0
  86. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/models.py +0 -0
  87. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/__init__.py +0 -0
  88. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/proxy.py +0 -0
  89. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/proxy/whitelist.py +0 -0
  90. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/__init__.py +0 -0
  91. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/api.py +0 -0
  92. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/base.py +0 -0
  93. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/browser.py +0 -0
  94. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/mcp.py +0 -0
  95. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/resources/sqlite.py +0 -0
  96. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/tasks.py +0 -0
  97. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/types.py +0 -0
  98. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/__init__.py +0 -0
  99. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/http_logging.py +0 -0
  100. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/logging.py +0 -0
  101. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/utils/playwright.py +0 -0
  102. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/__init__.py +0 -0
  103. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/bundler.py +0 -0
  104. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/code.py +0 -0
  105. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/db.py +0 -0
  106. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/decorator.py +0 -0
  107. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/parse.py +0 -0
  108. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/sql_differ.py +0 -0
  109. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet/verifiers/verifier.py +0 -0
  110. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/dependency_links.txt +0 -0
  111. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/entry_points.txt +0 -0
  112. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/requires.txt +0 -0
  113. {fleet_python-0.2.110 → fleet_python-0.2.112}/fleet_python.egg-info/top_level.txt +0 -0
  114. {fleet_python-0.2.110 → fleet_python-0.2.112}/scripts/fix_sync_imports.py +0 -0
  115. {fleet_python-0.2.110 → fleet_python-0.2.112}/scripts/unasync.py +0 -0
  116. {fleet_python-0.2.110 → fleet_python-0.2.112}/setup.cfg +0 -0
  117. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/__init__.py +0 -0
  118. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_app_method.py +0 -0
  119. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_expect_exactly.py +0 -0
  120. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_expect_only.py +0 -0
  121. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_instance_dispatch.py +0 -0
  122. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_sqlite_resource_dual_mode.py +0 -0
  123. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_sqlite_shared_memory_behavior.py +0 -0
  124. {fleet_python-0.2.110 → fleet_python-0.2.112}/tests/test_verifier_from_string.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fleet-python
3
- Version: 0.2.110
3
+ Version: 0.2.112
4
4
  Summary: Python SDK for Fleet environments
5
5
  Author-email: Fleet AI <nic@fleet.so>
6
6
  License: Apache-2.0
@@ -68,12 +68,15 @@ from .tasks import (
68
68
  # Import shared types
69
69
  from .types import VerifierFunction
70
70
 
71
+ # Import judge data classes
72
+ from .judge import Rubric, Criterion, Image, JudgeResult
73
+
71
74
  # Create a module-level env attribute for convenient access
72
75
  from . import env
73
76
  from . import global_client as _global_client
74
77
  from ._async import global_client as _async_global_client
75
78
 
76
- __version__ = "0.2.110"
79
+ __version__ = "0.2.112"
77
80
 
78
81
  __all__ = [
79
82
  # Core classes
@@ -90,6 +93,11 @@ __all__ = [
90
93
  # Task models
91
94
  "Task",
92
95
  "VerifierFunction",
96
+ # Judge
97
+ "Rubric",
98
+ "Criterion",
99
+ "Image",
100
+ "JudgeResult",
93
101
  # Exceptions
94
102
  "FleetError",
95
103
  "FleetAPIError",
@@ -44,7 +44,7 @@ from ..types import VerifierFunction
44
44
  from .. import env
45
45
  from . import global_client as _async_global_client
46
46
 
47
- __version__ = "0.2.110"
47
+ __version__ = "0.2.112"
48
48
 
49
49
  __all__ = [
50
50
  # Core classes
@@ -26,7 +26,7 @@ from .exceptions import (
26
26
  try:
27
27
  from .. import __version__
28
28
  except ImportError:
29
- __version__ = "0.2.110"
29
+ __version__ = "0.2.112"
30
30
 
31
31
  logger = logging.getLogger(__name__)
32
32
 
@@ -54,6 +54,7 @@ from .tasks import Task
54
54
 
55
55
  if TYPE_CHECKING:
56
56
  from .verifiers import AsyncVerifierFunction
57
+ from .judge import AsyncJudge
57
58
 
58
59
 
59
60
  def _json_default(x: Any) -> Any:
@@ -344,6 +345,7 @@ class AsyncEnv(EnvironmentBase):
344
345
  self._client = client
345
346
  self._apps: Dict[str, AsyncInstanceClient] = {}
346
347
  self._instance: Optional[AsyncInstanceClient] = None
348
+ self._judge: Optional["AsyncJudge"] = None
347
349
 
348
350
  @property
349
351
  def instance(self) -> AsyncInstanceClient:
@@ -419,6 +421,18 @@ class AsyncEnv(EnvironmentBase):
419
421
  mcp_url = f"{self.urls.root}mcp"
420
422
  return AsyncMCPResource(url=mcp_url, env_key=self.env_key)
421
423
 
424
@property
def judge(self) -> "AsyncJudge":
    """Return this environment's LLM-as-judge client (orchestrator API).

    The client is created on first access and the same object is returned
    on every later access.
    """
    cached = self._judge
    if cached is not None:
        return cached

    # Imported lazily: the module-level import of AsyncJudge is only
    # performed under TYPE_CHECKING.
    from .judge import AsyncJudge

    cached = AsyncJudge(client=self._load_client, instance_id=self.instance_id)
    self._judge = cached
    return cached
435
+
422
436
  def state(self, uri: str) -> Resource:
423
437
  return self.instance.state(uri)
424
438
 
@@ -0,0 +1,96 @@
1
+ """Fleet SDK Judge - Async version.
2
+
3
+ Provides env.judge.grade() for async verifier scripts.
4
+ """
5
+
6
+ from typing import Dict, List, Optional, Union, TYPE_CHECKING
7
+
8
+ # Import shared classes and helpers from the sync module
9
+ from ..judge import (
10
+ Criterion,
11
+ Image,
12
+ JudgeResult,
13
+ Rubric,
14
+ _build_grade_request,
15
+ _parse_grade_response,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from .base import AsyncWrapper
20
+
21
+ # Re-export data classes so `from fleet._async.judge import ...` works
22
+ __all__ = [
23
+ "AsyncJudge",
24
+ "Criterion",
25
+ "Image",
26
+ "JudgeResult",
27
+ "Rubric",
28
+ ]
29
+
30
+
31
class AsyncJudge:
    """Async LLM-as-judge client that talks to the orchestrator API.

    This is the object exposed as ``env.judge`` on AsyncEnv instances. It
    posts to the orchestrator's ``/v1/judge/grade`` endpoint rather than to
    the environment API.
    """

    def __init__(self, client: "AsyncWrapper", instance_id: str):
        # instance_id is sent with every grade request; client performs the HTTP call.
        self._instance_id = instance_id
        self._client = client

    async def grade(
        self,
        rubric: Union[str, Rubric],
        submission: Optional[str] = None,
        *,
        ground_truth: Optional[Union[str, dict]] = None,
        problem: Optional[str] = None,
        context: Optional[str] = None,
        reference_claims: Optional[str] = None,
        conversation: Optional[List[dict]] = None,
        images: Optional[Dict[str, Image]] = None,
        model: Optional[str] = None,
        provider: Optional[str] = None,
        agentic: bool = False,
        collect: Optional[Dict[str, List[str]]] = None,
        task_id: Optional[str] = None,
    ) -> JudgeResult:
        """Grade a submission with an LLM judge via the orchestrator API.

        The request body is assembled by ``_build_grade_request`` and the
        JSON reply is converted by ``_parse_grade_response``.

        Args:
            rubric: Grading rubric, either plain text or a structured Rubric.
            submission: The agent's final answer / submission text.
            ground_truth: Expected answer (string or dict).
            problem: The original problem statement.
            context: Additional context for the judge.
            reference_claims: Reference analysis claims.
            conversation: Conversation history as a list of message dicts.
            images: Named images for the judge, keyed by label.
            model: Override LLM model (server picks a default if None).
            provider: Override LLM provider (server picks a default if None).
            agentic: If True, the orchestrator collects artifacts from the instance.
            collect: File patterns for the orchestrator to collect (agentic mode).
            task_id: Optional task ID for tracking.

        Returns:
            A JudgeResult (a float subclass carrying .details, .criteria and
            .feedback) that can be returned directly from a verifier function.
        """
        optional_fields = dict(
            ground_truth=ground_truth,
            problem=problem,
            context=context,
            reference_claims=reference_claims,
            conversation=conversation,
            images=images,
            model=model,
            provider=provider,
            agentic=agentic,
            collect=collect,
            task_id=task_id,
        )
        payload = _build_grade_request(
            self._instance_id, rubric, submission, **optional_fields
        )

        reply = await self._client.request("POST", "/v1/judge/grade", json=payload)
        return _parse_grade_response(reply.json())
@@ -301,6 +301,14 @@ class AsyncFilesystemResource(Resource):
301
301
  response = await self.client.request(
302
302
  "POST", "/fs/file", json=request.model_dump()
303
303
  )
304
+ if response.status_code == 404:
305
+ return FileStateResponse(
306
+ success=True, path=path, exists=False,
307
+ message=response.json().get("detail", "File not found"),
308
+ )
309
+ if response.status_code >= 400:
310
+ detail = response.json().get("detail", response.text)
311
+ raise RuntimeError(f"Failed to get file state for '{path}': {detail}")
304
312
  return FileStateResponse(**response.json())
305
313
 
306
314
  async def file_text(self, path: str, max_content_size: int = 102400) -> str:
@@ -27,7 +27,7 @@ from .exceptions import (
27
27
  try:
28
28
  from . import __version__
29
29
  except ImportError:
30
- __version__ = "0.2.110"
30
+ __version__ = "0.2.112"
31
31
 
32
32
  logger = logging.getLogger(__name__)
33
33
 
@@ -59,6 +59,7 @@ from .tasks import Task
59
59
 
60
60
  if TYPE_CHECKING:
61
61
  from .verifiers import SyncVerifierFunction
62
+ from .judge import SyncJudge
62
63
 
63
64
 
64
65
  def _json_default(x: Any) -> Any:
@@ -348,6 +349,7 @@ class SyncEnv(EnvironmentBase):
348
349
  self._client = client
349
350
  self._apps: Dict[str, InstanceClient] = {}
350
351
  self._instance: Optional[InstanceClient] = None
352
+ self._judge: Optional["SyncJudge"] = None
351
353
  self._manager_url_override: Optional[str] = None # For URL mode
352
354
 
353
355
  @property
@@ -431,6 +433,18 @@ class SyncEnv(EnvironmentBase):
431
433
  mcp_url = f"{self.urls.root}mcp"
432
434
  return SyncMCPResource(url=mcp_url, env_key=self.env_key)
433
435
 
436
@property
def judge(self) -> "SyncJudge":
    """Return this environment's LLM-as-judge client (orchestrator API).

    The client is created on first access and the same object is returned
    on every later access.
    """
    cached = self._judge
    if cached is not None:
        return cached

    # Imported lazily: the module-level import of SyncJudge is only
    # performed under TYPE_CHECKING.
    from .judge import SyncJudge

    cached = SyncJudge(client=self._load_client, instance_id=self.instance_id)
    self._judge = cached
    return cached
447
+
434
448
  def state(self, uri: str) -> Resource:
435
449
  return self.instance.state(uri)
436
450
 
@@ -0,0 +1,521 @@
1
+ """Fleet SDK Judge - LLM-as-Judge grading via orchestrator API.
2
+
3
+ Provides env.judge.grade() for verifier scripts to grade submissions
4
+ using LLM judges without managing API keys, HTTP calls, or response parsing.
5
+
6
+ All LLM calls happen server-side on the orchestrator — the SDK just sends
7
+ the rubric, submission, and artifacts, and gets back a score.
8
+ """
9
+
10
+ import base64
11
+ import json
12
+ import logging
13
+ import os
14
+ from dataclasses import dataclass, field
15
+ from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING
16
+
17
+ if TYPE_CHECKING:
18
+ from .base import SyncWrapper
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Data classes (used by both sync and async)
25
+ # ---------------------------------------------------------------------------
26
+
27
+
28
def _guess_media_type(filename: str) -> str:
    """Map a filename's extension to an image media type.

    The text after the last "." is matched case-insensitively against the
    known image extensions. A missing or unrecognised extension falls back
    to "image/png".
    """
    _, dot, suffix = filename.lower().rpartition(".")
    if not dot:
        # No extension at all.
        return "image/png"
    if suffix in ("jpg", "jpeg"):
        return "image/jpeg"
    if suffix == "gif":
        return "image/gif"
    if suffix == "webp":
        return "image/webp"
    if suffix == "svg":
        return "image/svg+xml"
    # Covers "png" itself as well as every unknown extension.
    return "image/png"
39
+
40
+
41
@dataclass
class Criterion:
    """One scored line item of a grading rubric.

    Args:
        name: Name of this criterion (e.g., "Takeaway Alignment").
        max: Maximum points for this criterion.
        levels: Optional mapping of score -> description for each level;
            rendered into the description string sent to the API.
        description: Optional freeform description (alternative to levels).
    """

    name: str
    max: int
    levels: Optional[Dict[int, str]] = None
    description: Optional[str] = None

    def _render_description(self) -> str:
        """Flatten levels and description into one newline-joined string.

        Levels come first, highest score first, followed by the freeform
        description. When neither is set the criterion name is returned.
        """
        lines = []
        if self.levels:
            ranked = sorted(self.levels.items(), key=lambda pair: pair[0], reverse=True)
            lines.extend(f"- {points} points: {text}" for points, text in ranked)
        if self.description:
            lines.append(self.description)
        if not lines:
            return self.name
        return "\n".join(lines)

    def serialize(self) -> dict:
        """Return the dict form used in the request body."""
        payload = {"name": self.name, "max_score": self.max}
        payload["description"] = self._render_description()
        return payload
74
+
75
+
76
@dataclass
class Rubric:
    """A structured grading rubric made of individual criteria.

    Args:
        criteria: List of Criterion objects.
        system_prompt: Optional override for the judge system prompt.
    """

    criteria: List[Criterion] = field(default_factory=list)
    system_prompt: Optional[str] = None

    @property
    def max_score(self) -> int:
        """Total of the ``max`` points over all criteria."""
        total = 0
        for criterion in self.criteria:
            total += criterion.max
        return total

    def serialize(self) -> dict:
        """Return the dict form used in the request body.

        ``system_prompt`` is only included when it has been set.
        """
        payload: dict = {"type": "structured"}
        payload["criteria"] = [criterion.serialize() for criterion in self.criteria]
        if self.system_prompt is None:
            return payload
        payload["system_prompt"] = self.system_prompt
        return payload
100
+
101
+
102
class Image:
    """Reference to an image handed to the LLM judge.

    Build instances through the static constructors:
        Image.s3("s3://bucket/key")          - S3 URL, fetched server-side
        Image.from_url("https://...")        - HTTP URL, fetched server-side
        Image.from_base64(data, "file.png")  - Inline base64 data
        Image.from_env(env, "plot.png")      - Collect from environment
    """

    def __init__(
        self,
        *,
        source: str,
        url: Optional[str] = None,
        data: Optional[str] = None,
        filename: Optional[str] = None,
        media_type: Optional[str] = None,
        _env: Optional[Any] = None,
    ):
        self.source, self.url, self.data = source, url, data
        self.filename, self.media_type = filename, media_type
        self._env = _env

    @staticmethod
    def s3(url: str, media_type: Optional[str] = None) -> "Image":
        """Point at an image stored in S3; the orchestrator fetches it server-side."""
        return Image(source="s3", url=url, media_type=media_type)

    @staticmethod
    def from_url(url: str, media_type: Optional[str] = None) -> "Image":
        """Point at an image by HTTP URL; the orchestrator fetches it server-side."""
        return Image(source="url", url=url, media_type=media_type)

    @staticmethod
    def from_base64(
        data: str, filename: str = "image.png", media_type: Optional[str] = None
    ) -> "Image":
        """Wrap inline base64 image data.

        When no media type is given it is guessed from *filename*.
        """
        resolved_type = media_type or _guess_media_type(filename)
        return Image(
            source="base64", data=data, filename=filename, media_type=resolved_type
        )

    @staticmethod
    def from_env(env: Any, filename: str) -> "Image":
        """Refer to an image that lives in the environment.

        In non-agentic mode, the SDK collects the image client-side
        (DB -> notebook -> filesystem) and sends base64 to the orchestrator.

        In agentic mode, only the filename hint is sent and the orchestrator
        collects it.
        """
        return Image(source="env", filename=filename, _env=env)

    def _remote_reference(self, source_name: str) -> dict:
        """Build the payload for a server-fetched image (s3 / url)."""
        payload = {"source": source_name, "url": self.url}
        if self.media_type:
            payload["media_type"] = self.media_type
        return payload

    def serialize(self, *, label: Optional[str] = None, agentic: bool = False) -> dict:
        """Return the dict form of this image for the orchestrator request body.

        Args:
            label: Optional label stored under the "label" key.
            agentic: For "env" images, skip client-side collection and send
                only a "collect" selector.

        Raises:
            ValueError: If ``source`` is not one of the known kinds.
        """
        payload: dict
        if self.source == "s3":
            payload = self._remote_reference("s3")
        elif self.source == "url":
            payload = self._remote_reference("url")
        elif self.source == "base64":
            payload = {
                "source": "base64",
                "data": self.data,
                "media_type": self.media_type
                or _guess_media_type(self.filename or "image.png"),
            }
        elif self.source == "env":
            # Agentic mode never collects locally; a failed local collection
            # falls back to the same "collect" selector.
            encoded = (
                None if agentic else _collect_image_from_env(self._env, self.filename)
            )
            if encoded is None:
                payload = {"source": "collect", "selector": self.filename}
            else:
                payload = {
                    "source": "base64",
                    "data": encoded,
                    "media_type": _guess_media_type(self.filename or "image.png"),
                }
        else:
            raise ValueError(f"Unknown image source: {self.source}")

        if label is not None:
            payload["label"] = label
        return payload
198
+
199
+
200
class JudgeResult(float):
    """A score that behaves as a float and carries the judge's details.

    Because it is a float, it can be returned directly from a verifier
    function. The extra attributes are:
        details: the dict passed in (or an empty dict)
        criteria: details["criteria"], default []
        feedback: details["feedback"], default ""
        execution_id: details["execution_id"], default ""
    """

    def __new__(cls, score: float, *, details: Optional[dict] = None):
        result = super().__new__(cls, score)
        payload = details if details else {}
        result.details = payload  # type: ignore[attr-defined]
        # Convenience attributes lifted out of the details payload.
        for attr, fallback in (("criteria", []), ("feedback", ""), ("execution_id", "")):
            setattr(result, attr, payload.get(attr, fallback))
        return result
214
+
215
+
216
+ # ---------------------------------------------------------------------------
217
+ # Image collection helpers
218
+ # ---------------------------------------------------------------------------
219
+
220
+
221
def _extract_query_rows(result: Any) -> List[Dict[str, Any]]:
    """Normalise a query response into a list of row dicts.

    Three input shapes are recognised, checked in this order:
      1. an object with list-valued ``columns`` and ``rows`` attributes,
      2. a dict with list-valued "columns" and "rows" entries,
      3. a plain list, from which only dict items are kept.
    Anything else (including None) yields an empty list.
    """

    def tabulate(columns: list, records: list) -> List[Dict[str, Any]]:
        # Sequence rows are zipped with the column names (truncated to the
        # shorter of the two); dict rows pass through; other rows are dropped.
        table = []
        for record in records:
            if isinstance(record, (list, tuple)):
                table.append({str(col): val for col, val in zip(columns, record)})
            elif isinstance(record, dict):
                table.append(record)
        return table

    if result is None:
        return []

    columns = getattr(result, "columns", None)
    records = getattr(result, "rows", None)
    if isinstance(columns, list) and isinstance(records, list):
        return tabulate(columns, records)

    if isinstance(result, dict):
        columns, records = result.get("columns"), result.get("rows")
        if isinstance(columns, list) and isinstance(records, list):
            return tabulate(columns, records)

    if isinstance(result, list):
        return [entry for entry in result if isinstance(entry, dict)]
    return []
252
+
253
+
254
def _sql_string_literal(value: Any) -> str:
    """Quote *value* as a SQL string literal, doubling embedded single quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _image_from_db_files(env: Any, filename: str) -> Optional[str]:
    """Strategy 1: look *filename* up in the ``files`` table of the "current" DB."""
    # The filename is embedded as an escaped literal so that a name containing
    # a single quote cannot break or alter the query.
    exact = _sql_string_literal(filename)
    nested = _sql_string_literal(f"%/{filename}")
    rows = _extract_query_rows(
        env.db("current").query(
            "SELECT path, hex(content) AS content_hex FROM files "
            f"WHERE path = {exact} OR path LIKE {nested}"
        )
    )

    candidates: Dict[str, bytes] = {}
    for row in rows:
        path, content_hex = row.get("path", ""), row.get("content_hex", "")
        if not (path and content_hex):
            continue
        try:
            candidates[path] = bytes.fromhex(content_hex)
        except (ValueError, TypeError) as e:
            # A row with undecodable content is skipped, not fatal.
            logger.debug("Skipping undecodable DB file %s: %s", path, e)
    if not candidates:
        return None

    # Prefer non-dataroom paths; among the preferred set the shortest path wins.
    preferred = [p for p in candidates if not p.startswith("dataroom/")]
    best = min(preferred or list(candidates), key=len)
    logger.debug("Loaded image from DB: %s", best)
    return base64.b64encode(candidates[best]).decode()


def _last_png_output(notebook: Any) -> Optional[str]:
    """Return the first non-empty image/png output, scanning cells last-to-first."""
    for cell in reversed(notebook.get("cells", [])):
        for output in cell.get("outputs", []):
            if output.get("output_type") not in ("display_data", "execute_result"):
                continue
            png = output.get("data", {}).get("image/png")
            if not png:
                continue
            if isinstance(png, list):
                # Multi-line payloads are stored as a list of string chunks.
                png = "".join(png)
            png = png.strip()
            if png:
                return png
    return None


def _image_from_notebooks(env: Any) -> Optional[str]:
    """Strategy 2: take a PNG cell output from a notebook stored in the DB.

    Note that the requested filename is not consulted here: the first PNG
    output found in any ``notebooks/*.ipynb`` row is returned.
    """
    rows = _extract_query_rows(
        env.db("current").query(
            "SELECT path, hex(content) AS content_hex FROM files "
            "WHERE path LIKE 'notebooks/%.ipynb'"
        )
    )
    for row in rows:
        content_hex = row.get("content_hex", "")
        if not content_hex:
            continue
        try:
            notebook = json.loads(bytes.fromhex(content_hex).decode("utf-8"))
            png = _last_png_output(notebook)
        except Exception as e:
            # Best-effort: a malformed notebook must not stop the search.
            logger.debug("Skipping unreadable notebook %s: %s", row.get("path"), e)
            continue
        if png:
            logger.debug("Loaded image from notebook: %s", row.get("path"))
            return png
    return None


def _image_from_filesystem(filename: str) -> Optional[str]:
    """Strategy 3: read *filename* from the local filesystem search paths."""
    search_paths = (
        filename,
        f"/app/workspace/{filename}",
        f"/workspace/{filename}",
    )
    for candidate in search_paths:
        try:
            if os.path.exists(candidate):
                with open(candidate, "rb") as handle:
                    logger.debug("Loaded image from filesystem: %s", candidate)
                    return base64.b64encode(handle.read()).decode()
        except Exception as e:
            # Best-effort: an unreadable candidate just moves on to the next one.
            logger.debug("Filesystem image read failed for %s: %s", candidate, e)
    return None


def _collect_image_from_env(env: Any, filename: str) -> Optional[str]:
    """Collect an image from the environment using DB -> notebook -> filesystem strategies.

    Each strategy is best-effort: a failure is logged at debug level and the
    next strategy is tried.

    Args:
        env: Environment object; ``env.db("current").query(sql)`` is used for
            the two DB-backed strategies.
        filename: Name of the image file to look for.

    Returns:
        Base64-encoded image data, or None if no strategy found the image.
    """
    try:
        found = _image_from_db_files(env, filename)
        if found is not None:
            return found
    except Exception as e:
        logger.debug("DB image query failed: %s", e)

    try:
        found = _image_from_notebooks(env)
        if found is not None:
            return found
    except Exception as e:
        logger.debug("Notebook image query failed: %s", e)

    return _image_from_filesystem(filename)
331
+
332
+
333
+ # ---------------------------------------------------------------------------
334
+ # Accumulator printing (verifier protocol)
335
+ # ---------------------------------------------------------------------------
336
+
337
+
338
def _print_accumulators(data: dict) -> None:
    """Echo the response's accumulators to stdout (verifier protocol).

    Reads ``data["accumulators"]`` and, for each non-empty section, prints an
    opening marker, the section as JSON, and a closing marker. A non-empty
    "timing" section is printed as a single summary line. Nothing is printed
    when there are no accumulators.
    """
    acc = data.get("accumulators")
    if not acc:
        return

    # NOTE(review): only the error opener carries a "[STDOUT] " prefix; the
    # other markers do not. Kept as-is - confirm whether that is intended.
    sections = (
        ("errors", "[STDOUT] >>> ERROR_ACCUMULATOR >>>", "<<< ERROR_ACCUMULATOR <<<"),
        ("successes", ">>> SUCCESS_ACCUMULATOR >>>", "<<< SUCCESS_ACCUMULATOR <<<"),
        ("grading_details", ">>> GRADING_DETAILS >>>", "<<< GRADING_DETAILS <<<"),
    )
    for key, opener, closer in sections:
        payload = acc.get(key)
        if not payload:
            continue
        print(opener)
        print(json.dumps(payload))
        print(closer)

    timing = acc.get("timing")
    if not timing:
        return
    started = timing.get("started_ms")
    finished = timing.get("finished_ms")
    duration = timing.get("duration_ms")
    print(
        f">>> TIMING: started={started}, "
        f"finished={finished}, "
        f"duration={duration}ms <<<"
    )
369
+
370
+
371
+ # ---------------------------------------------------------------------------
372
+ # Request body builder (shared by sync and async)
373
+ # ---------------------------------------------------------------------------
374
+
375
+
376
def _build_grade_request(
    instance_id: str,
    rubric: Union[str, Rubric],
    submission: Optional[str],
    *,
    ground_truth: Optional[Union[str, dict]] = None,
    problem: Optional[str] = None,
    context: Optional[str] = None,
    reference_claims: Optional[str] = None,
    conversation: Optional[List[dict]] = None,
    images: Optional[Dict[str, Image]] = None,
    model: Optional[str] = None,
    provider: Optional[str] = None,
    agentic: bool = False,
    collect: Optional[Dict[str, List[str]]] = None,
    task_id: Optional[str] = None,
) -> dict:
    """Assemble the JSON body for POST /v1/judge/grade.

    ``instance_id``, ``submission``, ``agentic`` and ``rubric`` are always
    present; every other field is included only when it is not None.
    ``reference_claims`` has no key of its own: it is appended to ``context``
    under a "## Reference Claims" heading.

    Raises:
        TypeError: If ``rubric`` is neither a str nor a Rubric.
    """
    # Rubric: plain text is wrapped, structured rubrics serialize themselves.
    if isinstance(rubric, str):
        rubric_payload = {"type": "string", "text": rubric}
    elif isinstance(rubric, Rubric):
        rubric_payload = rubric.serialize()
    else:
        raise TypeError(f"rubric must be str or Rubric, got {type(rubric)}")

    body: Dict[str, Any] = {
        "instance_id": instance_id,
        "submission": submission,
        "agentic": agentic,
        "rubric": rubric_payload,
    }

    # Fold reference_claims into context.
    if reference_claims is not None:
        claims = f"## Reference Claims\n{reference_claims}"
        context = f"{context}\n\n{claims}" if context else claims

    # Only the role/content pair of each message is forwarded.
    trimmed_conversation = None
    if conversation is not None:
        trimmed_conversation = [
            {"role": message["role"], "content": message["content"]}
            for message in conversation
        ]

    optional_fields = (
        ("ground_truth", ground_truth),
        ("problem", problem),
        ("context", context),
        ("conversation", trimmed_conversation),
        ("model", model),
        ("provider", provider),
        ("task_id", task_id),
        ("collect", collect),
    )
    for key, value in optional_fields:
        if value is not None:
            body[key] = value

    # Images are sent as an array, each entry tagged with its dict key as label.
    if images:
        body["images"] = [
            image.serialize(label=name, agentic=agentic)
            for name, image in images.items()
        ]

    return body
442
+
443
+
444
def _parse_grade_response(data: dict) -> JudgeResult:
    """Turn an orchestrator grade response into a ``JudgeResult``.

    The response's accumulators are printed first (via
    ``_print_accumulators``), then the score is read from
    ``data["normalized_score"]``.

    Args:
        data: Parsed JSON body of the grade response.

    Returns:
        A ``JudgeResult`` built from the score, with the full response dict
        attached as ``details``. The score is ``0.0`` when
        ``"normalized_score"`` is missing or ``None``.
    """
    _print_accumulators(data)

    raw_score = data.get("normalized_score")
    # A JSON ``null`` arrives as None, and ``dict.get``'s default only covers
    # a missing key, so handle None explicitly instead of letting
    # ``float(None)`` raise TypeError.
    score = 0.0 if raw_score is None else float(raw_score)
    return JudgeResult(score, details=data)
449
+
450
+
451
+ # ---------------------------------------------------------------------------
452
+ # Sync judge
453
+ # ---------------------------------------------------------------------------
454
+
455
+
456
class SyncJudge:
    """Synchronous LLM-as-judge grader backed by the orchestrator API.

    Requests go to the orchestrator's ``/v1/judge/grade`` endpoint, not to the
    environment API. Accessed as ``env.judge`` on SyncEnv instances.
    """

    def __init__(self, client: "SyncWrapper", instance_id: str):
        # The client issues the HTTP request; the instance id is sent with
        # every grade request.
        self._client, self._instance_id = client, instance_id

    def grade(
        self,
        rubric: Union[str, Rubric],
        submission: Optional[str] = None,
        *,
        ground_truth: Optional[Union[str, dict]] = None,
        problem: Optional[str] = None,
        context: Optional[str] = None,
        reference_claims: Optional[str] = None,
        conversation: Optional[List[dict]] = None,
        images: Optional[Dict[str, Image]] = None,
        model: Optional[str] = None,
        provider: Optional[str] = None,
        agentic: bool = False,
        collect: Optional[Dict[str, List[str]]] = None,
        task_id: Optional[str] = None,
    ) -> JudgeResult:
        """Grade a submission with LLM-as-judge through the orchestrator API.

        Builds the request body, POSTs it to ``/v1/judge/grade`` and parses
        the JSON reply. The returned JudgeResult (float subclass with
        .details, .criteria, .feedback) can be returned directly from a
        verifier function.

        Args:
            rubric: Grading rubric, either a string or a structured Rubric.
            submission: The agent's final answer / submission text.
            ground_truth: Expected answer (string or dict).
            problem: The original problem statement.
            context: Additional context for the judge.
            reference_claims: Reference analysis claims (folded into context).
            conversation: Conversation history as a list of message dicts.
            images: Mapping of label to Image object for the judge.
            model: Override LLM model (server picks default if None).
            provider: Override LLM provider (server picks default if None).
            agentic: If True, the orchestrator collects artifacts from the
                instance.
            collect: File patterns for the orchestrator to collect (agentic
                mode).
            task_id: Optional task ID for tracking.

        Returns:
            The parsed grade response as a JudgeResult.
        """
        # Every keyword-only argument is forwarded unchanged to the shared
        # request builder.
        grade_options = {
            "ground_truth": ground_truth,
            "problem": problem,
            "context": context,
            "reference_claims": reference_claims,
            "conversation": conversation,
            "images": images,
            "model": model,
            "provider": provider,
            "agentic": agentic,
            "collect": collect,
            "task_id": task_id,
        }
        request_body = _build_grade_request(
            self._instance_id, rubric, submission, **grade_options
        )

        http_response = self._client.request(
            "POST", "/v1/judge/grade", json=request_body
        )
        response_payload = http_response.json()
        return _parse_grade_response(response_payload)
@@ -301,6 +301,14 @@ class FilesystemResource(Resource):
301
301
  response = self.client.request(
302
302
  "POST", "/fs/file", json=request.model_dump()
303
303
  )
304
+ if response.status_code == 404:
305
+ return FileStateResponse(
306
+ success=True, path=path, exists=False,
307
+ message=response.json().get("detail", "File not found"),
308
+ )
309
+ if response.status_code >= 400:
310
+ detail = response.json().get("detail", response.text)
311
+ raise RuntimeError(f"Failed to get file state for '{path}': {detail}")
304
312
  return FileStateResponse(**response.json())
305
313
 
306
314
  def file_text(self, path: str, max_content_size: int = 102400) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fleet-python
3
- Version: 0.2.110
3
+ Version: 0.2.112
4
4
  Summary: Python SDK for Fleet environments
5
5
  Author-email: Fleet AI <nic@fleet.so>
6
6
  License: Apache-2.0
@@ -34,6 +34,7 @@ fleet/client.py
34
34
  fleet/config.py
35
35
  fleet/exceptions.py
36
36
  fleet/global_client.py
37
+ fleet/judge.py
37
38
  fleet/models.py
38
39
  fleet/tasks.py
39
40
  fleet/types.py
@@ -42,6 +43,7 @@ fleet/_async/base.py
42
43
  fleet/_async/client.py
43
44
  fleet/_async/exceptions.py
44
45
  fleet/_async/global_client.py
46
+ fleet/_async/judge.py
45
47
  fleet/_async/models.py
46
48
  fleet/_async/tasks.py
47
49
  fleet/_async/env/__init__.py
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
5
  [project]
6
6
  name = "fleet-python"
7
7
 
8
- version = "0.2.110"
8
+ version = "0.2.112"
9
9
  description = "Python SDK for Fleet environments"
10
10
  authors = [
11
11
  {name = "Fleet AI", email = "nic@fleet.so"},
File without changes
File without changes
File without changes