fleet-python 0.2.115__tar.gz → 0.2.116__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. {fleet_python-0.2.115/fleet_python.egg-info → fleet_python-0.2.116}/PKG-INFO +1 -1
  2. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/__init__.py +1 -1
  3. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/__init__.py +1 -1
  4. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/base.py +1 -1
  5. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/client.py +0 -2
  6. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/base.py +1 -1
  7. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/client.py +0 -2
  8. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/judge.py +54 -0
  9. {fleet_python-0.2.115 → fleet_python-0.2.116/fleet_python.egg-info}/PKG-INFO +1 -1
  10. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/SOURCES.txt +1 -0
  11. {fleet_python-0.2.115 → fleet_python-0.2.116}/pyproject.toml +1 -1
  12. fleet_python-0.2.116/tests/test_judge_criteria_markers.py +192 -0
  13. {fleet_python-0.2.115 → fleet_python-0.2.116}/LICENSE +0 -0
  14. {fleet_python-0.2.115 → fleet_python-0.2.116}/README.md +0 -0
  15. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/diff_example.py +0 -0
  16. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/dsl_example.py +0 -0
  17. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example.py +0 -0
  18. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/exampleResume.py +0 -0
  19. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_account.py +0 -0
  20. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_action_log.py +0 -0
  21. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_client.py +0 -0
  22. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_mcp_anthropic.py +0 -0
  23. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_mcp_openai.py +0 -0
  24. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_sync.py +0 -0
  25. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_task.py +0 -0
  26. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_tasks.py +0 -0
  27. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_verifier.py +0 -0
  28. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/export_tasks.py +0 -0
  29. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/export_tasks_filtered.py +0 -0
  30. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/fetch_tasks.py +0 -0
  31. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/gemini_example.py +0 -0
  32. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/import_tasks.py +0 -0
  33. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/iterate_verifiers.py +0 -0
  34. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/json_tasks_example.py +0 -0
  35. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/nova_act_example.py +0 -0
  36. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/openai_example.py +0 -0
  37. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/openai_simple_example.py +0 -0
  38. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/query_builder_example.py +0 -0
  39. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/quickstart.py +0 -0
  40. {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/test_cdp_logging.py +0 -0
  41. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/env/__init__.py +0 -0
  42. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/env/client.py +0 -0
  43. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/exceptions.py +0 -0
  44. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/global_client.py +0 -0
  45. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/__init__.py +0 -0
  46. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/base.py +0 -0
  47. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/client.py +0 -0
  48. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/judge.py +0 -0
  49. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/models.py +0 -0
  50. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/__init__.py +0 -0
  51. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/api.py +0 -0
  52. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/base.py +0 -0
  53. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/browser.py +0 -0
  54. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/filesystem.py +0 -0
  55. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/mcp.py +0 -0
  56. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/sqlite.py +0 -0
  57. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/tasks.py +0 -0
  58. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/__init__.py +0 -0
  59. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/bundler.py +0 -0
  60. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/verifier.py +0 -0
  61. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/__init__.py +0 -0
  62. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/Dockerfile +0 -0
  63. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/__init__.py +0 -0
  64. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/agent.py +0 -0
  65. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp/main.py +0 -0
  66. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
  67. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
  68. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
  69. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/requirements.txt +0 -0
  70. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/start.sh +0 -0
  71. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/orchestrator.py +0 -0
  72. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/types.py +0 -0
  73. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/utils.py +0 -0
  74. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/cli.py +0 -0
  75. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/config.py +0 -0
  76. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/env/__init__.py +0 -0
  77. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/env/client.py +0 -0
  78. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/eval/__init__.py +0 -0
  79. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/eval/uploader.py +0 -0
  80. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/exceptions.py +0 -0
  81. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/global_client.py +0 -0
  82. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/__init__.py +0 -0
  83. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/base.py +0 -0
  84. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/client.py +0 -0
  85. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/models.py +0 -0
  86. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/models.py +0 -0
  87. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/__init__.py +0 -0
  88. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/proxy.py +0 -0
  89. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/whitelist.py +0 -0
  90. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/__init__.py +0 -0
  91. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/api.py +0 -0
  92. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/base.py +0 -0
  93. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/browser.py +0 -0
  94. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/filesystem.py +0 -0
  95. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/mcp.py +0 -0
  96. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/sqlite.py +0 -0
  97. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/tasks.py +0 -0
  98. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/types.py +0 -0
  99. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/__init__.py +0 -0
  100. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/http_logging.py +0 -0
  101. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/logging.py +0 -0
  102. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/playwright.py +0 -0
  103. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/__init__.py +0 -0
  104. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/bundler.py +0 -0
  105. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/code.py +0 -0
  106. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/db.py +0 -0
  107. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/decorator.py +0 -0
  108. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/parse.py +0 -0
  109. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/sql_differ.py +0 -0
  110. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/verifier.py +0 -0
  111. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/dependency_links.txt +0 -0
  112. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/entry_points.txt +0 -0
  113. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/requires.txt +0 -0
  114. {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/top_level.txt +0 -0
  115. {fleet_python-0.2.115 → fleet_python-0.2.116}/scripts/fix_sync_imports.py +0 -0
  116. {fleet_python-0.2.115 → fleet_python-0.2.116}/scripts/unasync.py +0 -0
  117. {fleet_python-0.2.115 → fleet_python-0.2.116}/setup.cfg +0 -0
  118. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/__init__.py +0 -0
  119. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_app_method.py +0 -0
  120. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_expect_exactly.py +0 -0
  121. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_expect_only.py +0 -0
  122. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_instance_dispatch.py +0 -0
  123. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_sqlite_resource_dual_mode.py +0 -0
  124. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_sqlite_shared_memory_behavior.py +0 -0
  125. {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_verifier_from_string.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fleet-python
3
- Version: 0.2.115
3
+ Version: 0.2.116
4
4
  Summary: Python SDK for Fleet environments
5
5
  Author-email: Fleet AI <nic@fleet.so>
6
6
  License: Apache-2.0
@@ -76,7 +76,7 @@ from . import env
76
76
  from . import global_client as _global_client
77
77
  from ._async import global_client as _async_global_client
78
78
 
79
- __version__ = "0.2.115"
79
+ __version__ = "0.2.116"
80
80
 
81
81
  __all__ = [
82
82
  # Core classes
@@ -44,7 +44,7 @@ from ..types import VerifierFunction
44
44
  from .. import env
45
45
  from . import global_client as _async_global_client
46
46
 
47
- __version__ = "0.2.115"
47
+ __version__ = "0.2.116"
48
48
 
49
49
  __all__ = [
50
50
  # Core classes
@@ -26,7 +26,7 @@ from .exceptions import (
26
26
  try:
27
27
  from .. import __version__
28
28
  except ImportError:
29
- __version__ = "0.2.115"
29
+ __version__ = "0.2.116"
30
30
 
31
31
  logger = logging.getLogger(__name__)
32
32
 
@@ -601,7 +601,6 @@ class AsyncFleet:
601
601
  )
602
602
 
603
603
  instance = AsyncEnv(client=self.client, **response.json())
604
- await instance.instance.load()
605
604
  return instance
606
605
 
607
606
  async def make_for_task(self, task: Task) -> AsyncEnv:
@@ -653,7 +652,6 @@ class AsyncFleet:
653
652
  else:
654
653
  response = await self.client.request("GET", f"/v1/env/instances/{instance_id}")
655
654
  instance = AsyncEnv(client=self.client, **response.json())
656
- await instance.instance.load()
657
655
  return instance
658
656
 
659
657
  def _create_url_instance(self, base_url: str) -> AsyncEnv:
@@ -27,7 +27,7 @@ from .exceptions import (
27
27
  try:
28
28
  from . import __version__
29
29
  except ImportError:
30
- __version__ = "0.2.115"
30
+ __version__ = "0.2.116"
31
31
 
32
32
  logger = logging.getLogger(__name__)
33
33
 
@@ -613,7 +613,6 @@ class Fleet:
613
613
  )
614
614
 
615
615
  instance = SyncEnv(client=self.client, **response.json())
616
- instance.instance.load()
617
616
  return instance
618
617
 
619
618
  def make_for_task(self, task: Task) -> SyncEnv:
@@ -665,7 +664,6 @@ class Fleet:
665
664
  else:
666
665
  response = self.client.request("GET", f"/v1/env/instances/{instance_id}")
667
666
  instance = SyncEnv(client=self.client, **response.json())
668
- instance.instance.load()
669
667
  return instance
670
668
 
671
669
  def _create_url_instance(self, base_url: str) -> SyncEnv:
@@ -823,6 +823,54 @@ def _parse_grade_response(data: dict) -> JudgeResult:
823
823
  return JudgeResult(score, details=data)
824
824
 
825
825
 
826
+ def _print_criteria_markers(criteria: list) -> None:
827
+ """Emit ``>>> CRITERIA >>>`` stdout markers for structured criteria display.
828
+
829
+ The orchestrator (theseus PR #1967) scans verifier stdout for these
830
+ markers and wraps the execution result so the client (client PR #1737)
831
+ can render an expandable rubric breakdown.
832
+
833
+ Converts from the orchestrator judge-response format::
834
+
835
+ {"name": str, "score": int, "max_score": int, "reasoning": str}
836
+
837
+ to the client-expected marker format::
838
+
839
+ {"criteria": str, "score": float, "score_out_of": float, "description"?: str}
840
+
841
+ Each criterion's score is normalised to a 0.0–1.0 float using its own
842
+ ``max_score``.
843
+ """
844
+ marker_criteria = []
845
+ for c in criteria:
846
+ name = c.get("name", "")
847
+ cscore = c.get("score", 0)
848
+ cmax = c.get("max_score", 0)
849
+
850
+ # Normalise per-criterion score to 0.0–1.0
851
+ if cmax and float(cmax) > 0:
852
+ norm_score = float(cscore) / float(cmax)
853
+ else:
854
+ norm_score = float(cscore)
855
+
856
+ entry: dict = {
857
+ "criteria": name,
858
+ "score": round(norm_score, 4),
859
+ "score_out_of": 1.0,
860
+ }
861
+
862
+ reasoning = c.get("reasoning", "")
863
+ if reasoning:
864
+ entry["description"] = reasoning
865
+
866
+ marker_criteria.append(entry)
867
+
868
+ if marker_criteria:
869
+ print(">>> CRITERIA >>>")
870
+ print(json.dumps(marker_criteria))
871
+ print("<<< CRITERIA <<<")
872
+
873
+
826
874
  def _print_judge_result(data: dict) -> None:
827
875
  """Print detailed judge grading result for verifier stdout capture."""
828
876
  model = data.get("model_used", "unknown")
@@ -848,6 +896,12 @@ def _print_judge_result(data: dict) -> None:
848
896
  if len(reasoning) > 200:
849
897
  reasoning = reasoning[:200] + "..."
850
898
  print(f"[C] {name}: {cscore}/{cmax} — {reasoning}")
899
+
900
+ # Emit structured criteria via stdout markers so the orchestrator
901
+ # (_extract_criteria_from_stdout) and client can render a rubric
902
+ # breakdown. Schema per element:
903
+ # {"criteria": str, "score": float, "score_out_of": float, "description"?: str}
904
+ _print_criteria_markers(criteria)
851
905
  else:
852
906
  print(f"[C] Score: {normalized:.2f}")
853
907
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fleet-python
3
- Version: 0.2.115
3
+ Version: 0.2.116
4
4
  Summary: Python SDK for Fleet environments
5
5
  Author-email: Fleet AI <nic@fleet.so>
6
6
  License: Apache-2.0
@@ -117,6 +117,7 @@ tests/test_app_method.py
117
117
  tests/test_expect_exactly.py
118
118
  tests/test_expect_only.py
119
119
  tests/test_instance_dispatch.py
120
+ tests/test_judge_criteria_markers.py
120
121
  tests/test_sqlite_resource_dual_mode.py
121
122
  tests/test_sqlite_shared_memory_behavior.py
122
123
  tests/test_verifier_from_string.py
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
5
  [project]
6
6
  name = "fleet-python"
7
7
 
8
- version = "0.2.115"
8
+ version = "0.2.116"
9
9
  description = "Python SDK for Fleet environments"
10
10
  authors = [
11
11
  {name = "Fleet AI", email = "nic@fleet.so"},
@@ -0,0 +1,192 @@
1
+ """Tests for structured criteria stdout markers in fleet.judge.
2
+
3
+ Validates that _print_criteria_markers emits the correct
4
+ >>> CRITERIA >>> / <<< CRITERIA <<< markers that the orchestrator
5
+ (theseus PR #1967) and client (client PR #1737) expect.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from io import StringIO
11
+ from unittest.mock import patch
12
+
13
+ import pytest
14
+
15
+ from fleet.judge import (
16
+ _print_criteria_markers,
17
+ _print_judge_result,
18
+ _parse_grade_response,
19
+ )
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Helpers
24
+ # ---------------------------------------------------------------------------
25
+
26
+ _MARKER_RE = re.compile(
27
+ r">>> CRITERIA >>>\s*\n(.*?)\n<<< CRITERIA <<<",
28
+ re.DOTALL,
29
+ )
30
+
31
+
32
+ def _capture_print(fn, *args, **kwargs):
33
+ """Capture all print() output from a function call."""
34
+ buf = StringIO()
35
+ with patch("builtins.print", side_effect=lambda *a, **kw: buf.write(" ".join(str(x) for x in a) + "\n")):
36
+ fn(*args, **kwargs)
37
+ return buf.getvalue()
38
+
39
+
40
+ def _extract_criteria_from_stdout(stdout: str):
41
+ """Mirror the orchestrator's extraction logic (theseus PR #1967)."""
42
+ m = _MARKER_RE.search(stdout)
43
+ if not m:
44
+ return None
45
+ parsed = json.loads(m.group(1).strip())
46
+ if isinstance(parsed, list):
47
+ return parsed
48
+ return None
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # _print_criteria_markers
53
+ # ---------------------------------------------------------------------------
54
+
55
+ class TestPrintCriteriaMarkers:
56
+ """Tests for _print_criteria_markers."""
57
+
58
+ def test_basic_criteria_output(self):
59
+ """Emits valid markers with normalised scores."""
60
+ criteria = [
61
+ {"name": "Accuracy", "score": 8, "max_score": 10, "reasoning": "Good job"},
62
+ {"name": "Style", "score": 5, "max_score": 5, "reasoning": "Perfect"},
63
+ ]
64
+ stdout = _capture_print(_print_criteria_markers, criteria)
65
+
66
+ parsed = _extract_criteria_from_stdout(stdout)
67
+ assert parsed is not None, f"Markers not found in stdout:\n{stdout}"
68
+ assert len(parsed) == 2
69
+
70
+ assert parsed[0]["criteria"] == "Accuracy"
71
+ assert parsed[0]["score"] == pytest.approx(0.8, abs=0.01)
72
+ assert parsed[0]["score_out_of"] == 1.0
73
+ assert parsed[0]["description"] == "Good job"
74
+
75
+ assert parsed[1]["criteria"] == "Style"
76
+ assert parsed[1]["score"] == pytest.approx(1.0, abs=0.01)
77
+ assert parsed[1]["score_out_of"] == 1.0
78
+
79
+ def test_zero_max_score_passthrough(self):
80
+ """When max_score is 0, raw score passes through."""
81
+ criteria = [
82
+ {"name": "Metric", "score": 0.75, "max_score": 0},
83
+ ]
84
+ stdout = _capture_print(_print_criteria_markers, criteria)
85
+ parsed = _extract_criteria_from_stdout(stdout)
86
+ assert parsed is not None
87
+ assert parsed[0]["score"] == pytest.approx(0.75, abs=0.01)
88
+
89
+ def test_empty_criteria_no_markers(self):
90
+ """Empty list should produce no markers."""
91
+ stdout = _capture_print(_print_criteria_markers, [])
92
+ assert ">>> CRITERIA >>>" not in stdout
93
+
94
+ def test_reasoning_maps_to_description(self):
95
+ """The 'reasoning' field maps to 'description' in the marker schema."""
96
+ criteria = [
97
+ {"name": "Test", "score": 3, "max_score": 5, "reasoning": "Some reasoning here"},
98
+ ]
99
+ stdout = _capture_print(_print_criteria_markers, criteria)
100
+ parsed = _extract_criteria_from_stdout(stdout)
101
+ assert parsed[0]["description"] == "Some reasoning here"
102
+
103
+ def test_missing_reasoning_no_description(self):
104
+ """When reasoning is empty, description key should be absent."""
105
+ criteria = [
106
+ {"name": "Test", "score": 3, "max_score": 5, "reasoning": ""},
107
+ ]
108
+ stdout = _capture_print(_print_criteria_markers, criteria)
109
+ parsed = _extract_criteria_from_stdout(stdout)
110
+ assert "description" not in parsed[0]
111
+
112
+ def test_output_parseable_by_orchestrator_regex(self):
113
+ """Ensure the output matches the exact regex the orchestrator uses."""
114
+ criteria = [
115
+ {"name": "A", "score": 1, "max_score": 2, "reasoning": "half"},
116
+ ]
117
+ stdout = _capture_print(_print_criteria_markers, criteria)
118
+
119
+ # Use the exact regex from theseus PR #1967
120
+ m = re.search(
121
+ r">>> CRITERIA >>>\s*\n(.*?)\n<<< CRITERIA <<<",
122
+ stdout,
123
+ re.DOTALL,
124
+ )
125
+ assert m is not None, "Output doesn't match orchestrator regex"
126
+ data = json.loads(m.group(1).strip())
127
+ assert isinstance(data, list)
128
+ assert data[0]["criteria"] == "A"
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # _print_judge_result integration
133
+ # ---------------------------------------------------------------------------
134
+
135
+ class TestPrintJudgeResult:
136
+ """Tests for _print_judge_result emitting criteria markers."""
137
+
138
+ def test_criteria_markers_emitted(self):
139
+ """_print_judge_result emits criteria markers when criteria present."""
140
+ data = {
141
+ "model_used": "claude-sonnet",
142
+ "provider_used": "anthropic",
143
+ "total_score": 15,
144
+ "max_score": 20,
145
+ "normalized_score": 0.75,
146
+ "criteria": [
147
+ {"name": "Accuracy", "score": 8, "max_score": 10, "reasoning": "Good"},
148
+ {"name": "Style", "score": 7, "max_score": 10, "reasoning": "Decent"},
149
+ ],
150
+ }
151
+ stdout = _capture_print(_print_judge_result, data)
152
+ parsed = _extract_criteria_from_stdout(stdout)
153
+ assert parsed is not None
154
+ assert len(parsed) == 2
155
+
156
+ def test_no_criteria_no_markers(self):
157
+ """_print_judge_result doesn't emit markers when no criteria."""
158
+ data = {
159
+ "model_used": "claude-sonnet",
160
+ "provider_used": "anthropic",
161
+ "total_score": 0,
162
+ "max_score": 0,
163
+ "normalized_score": 0.5,
164
+ }
165
+ stdout = _capture_print(_print_judge_result, data)
166
+ assert ">>> CRITERIA >>>" not in stdout
167
+
168
+
169
+ # ---------------------------------------------------------------------------
170
+ # _parse_grade_response integration
171
+ # ---------------------------------------------------------------------------
172
+
173
+ class TestParseGradeResponse:
174
+ """Tests for _parse_grade_response emitting criteria markers."""
175
+
176
+ def test_full_flow_emits_markers(self):
177
+ """_parse_grade_response → _print_judge_result → criteria markers."""
178
+ data = {
179
+ "model_used": "claude-sonnet",
180
+ "provider_used": "anthropic",
181
+ "total_score": 9,
182
+ "max_score": 10,
183
+ "normalized_score": 0.9,
184
+ "criteria": [
185
+ {"name": "Completeness", "score": 9, "max_score": 10, "reasoning": "Almost perfect"},
186
+ ],
187
+ }
188
+ stdout = _capture_print(_parse_grade_response, data)
189
+ parsed = _extract_criteria_from_stdout(stdout)
190
+ assert parsed is not None
191
+ assert parsed[0]["criteria"] == "Completeness"
192
+ assert parsed[0]["score"] == pytest.approx(0.9, abs=0.01)
File without changes
File without changes
File without changes