fleet-python 0.2.115__tar.gz → 0.2.116__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fleet_python-0.2.115/fleet_python.egg-info → fleet_python-0.2.116}/PKG-INFO +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/__init__.py +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/__init__.py +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/base.py +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/client.py +0 -2
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/base.py +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/client.py +0 -2
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/judge.py +54 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116/fleet_python.egg-info}/PKG-INFO +1 -1
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/SOURCES.txt +1 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/pyproject.toml +1 -1
- fleet_python-0.2.116/tests/test_judge_criteria_markers.py +192 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/LICENSE +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/README.md +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/diff_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/dsl_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/exampleResume.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_account.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_action_log.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_mcp_anthropic.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_mcp_openai.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_sync.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_task.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/example_verifier.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/export_tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/export_tasks_filtered.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/fetch_tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/gemini_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/import_tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/iterate_verifiers.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/json_tasks_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/nova_act_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/openai_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/openai_simple_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/query_builder_example.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/quickstart.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/examples/test_cdp_logging.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/env/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/env/client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/exceptions.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/global_client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/base.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/instance/client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/judge.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/models.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/api.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/base.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/browser.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/filesystem.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/mcp.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/resources/sqlite.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/bundler.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/_async/verifiers/verifier.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/Dockerfile +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/agent.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp/main.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/main.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/mcp_server/tools.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/requirements.txt +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/gemini_cua/start.sh +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/orchestrator.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/types.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/agent/utils.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/cli.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/config.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/env/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/env/client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/eval/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/eval/uploader.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/exceptions.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/global_client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/base.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/client.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/instance/models.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/models.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/proxy.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/proxy/whitelist.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/api.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/base.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/browser.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/filesystem.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/mcp.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/resources/sqlite.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/tasks.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/types.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/http_logging.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/logging.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/utils/playwright.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/bundler.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/code.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/db.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/decorator.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/parse.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/sql_differ.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet/verifiers/verifier.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/dependency_links.txt +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/entry_points.txt +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/requires.txt +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/fleet_python.egg-info/top_level.txt +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/scripts/fix_sync_imports.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/scripts/unasync.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/setup.cfg +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/__init__.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_app_method.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_expect_exactly.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_expect_only.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_instance_dispatch.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_sqlite_resource_dual_mode.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_sqlite_shared_memory_behavior.py +0 -0
- {fleet_python-0.2.115 → fleet_python-0.2.116}/tests/test_verifier_from_string.py +0 -0
|
@@ -601,7 +601,6 @@ class AsyncFleet:
|
|
|
601
601
|
)
|
|
602
602
|
|
|
603
603
|
instance = AsyncEnv(client=self.client, **response.json())
|
|
604
|
-
await instance.instance.load()
|
|
605
604
|
return instance
|
|
606
605
|
|
|
607
606
|
async def make_for_task(self, task: Task) -> AsyncEnv:
|
|
@@ -653,7 +652,6 @@ class AsyncFleet:
|
|
|
653
652
|
else:
|
|
654
653
|
response = await self.client.request("GET", f"/v1/env/instances/{instance_id}")
|
|
655
654
|
instance = AsyncEnv(client=self.client, **response.json())
|
|
656
|
-
await instance.instance.load()
|
|
657
655
|
return instance
|
|
658
656
|
|
|
659
657
|
def _create_url_instance(self, base_url: str) -> AsyncEnv:
|
|
@@ -613,7 +613,6 @@ class Fleet:
|
|
|
613
613
|
)
|
|
614
614
|
|
|
615
615
|
instance = SyncEnv(client=self.client, **response.json())
|
|
616
|
-
instance.instance.load()
|
|
617
616
|
return instance
|
|
618
617
|
|
|
619
618
|
def make_for_task(self, task: Task) -> SyncEnv:
|
|
@@ -665,7 +664,6 @@ class Fleet:
|
|
|
665
664
|
else:
|
|
666
665
|
response = self.client.request("GET", f"/v1/env/instances/{instance_id}")
|
|
667
666
|
instance = SyncEnv(client=self.client, **response.json())
|
|
668
|
-
instance.instance.load()
|
|
669
667
|
return instance
|
|
670
668
|
|
|
671
669
|
def _create_url_instance(self, base_url: str) -> SyncEnv:
|
|
@@ -823,6 +823,54 @@ def _parse_grade_response(data: dict) -> JudgeResult:
|
|
|
823
823
|
return JudgeResult(score, details=data)
|
|
824
824
|
|
|
825
825
|
|
|
826
|
+
def _print_criteria_markers(criteria: list) -> None:
    """Emit ``>>> CRITERIA >>>`` stdout markers for structured criteria display.

    The orchestrator (theseus PR #1967) scans verifier stdout for these
    markers and wraps the execution result so the client (client PR #1737)
    can render an expandable rubric breakdown.

    Converts from the orchestrator judge-response format::

        {"name": str, "score": int, "max_score": int, "reasoning": str}

    to the client-expected marker format::

        {"criteria": str, "score": float, "score_out_of": float, "description"?: str}

    Each criterion's score is normalised to a 0.0–1.0 float using its own
    ``max_score``. When ``max_score`` is missing, zero or negative, the raw
    score is passed through unchanged.

    This function only prints diagnostics, so malformed judge output is
    tolerated instead of raising:

    * ``criteria`` of ``None`` (or an empty list) prints nothing;
    * entries that are not dicts are skipped;
    * ``score`` / ``max_score`` values that are ``None`` or cannot be
      converted to ``float`` are treated as ``0``;
    * a ``None`` name becomes ``""`` and any other name is passed through
      ``str()`` so the emitted ``criteria`` field is always a string.

    Args:
        criteria: Criterion dicts in the judge-response format shown above.
    """

    def _as_float(value) -> float:
        # JSON null arrives as None and free-form judge output may contain
        # non-numeric strings; neither should abort the verifier run.
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return 0.0

    marker_criteria = []
    for c in criteria or []:
        if not isinstance(c, dict):
            continue

        name = c.get("name")
        cscore = _as_float(c.get("score"))
        cmax = _as_float(c.get("max_score"))

        # Normalise per-criterion score to 0.0–1.0; without a usable
        # max_score the raw score is emitted as-is.
        norm_score = cscore / cmax if cmax > 0 else cscore

        entry: dict = {
            "criteria": "" if name is None else str(name),
            "score": round(norm_score, 4),
            "score_out_of": 1.0,
        }

        # "description" is optional in the marker schema: omit it when the
        # judge gave no reasoning.
        reasoning = c.get("reasoning", "")
        if reasoning:
            entry["description"] = reasoning

        marker_criteria.append(entry)

    if marker_criteria:
        print(">>> CRITERIA >>>")
        print(json.dumps(marker_criteria))
        print("<<< CRITERIA <<<")
|
|
872
|
+
|
|
873
|
+
|
|
826
874
|
def _print_judge_result(data: dict) -> None:
|
|
827
875
|
"""Print detailed judge grading result for verifier stdout capture."""
|
|
828
876
|
model = data.get("model_used", "unknown")
|
|
@@ -848,6 +896,12 @@ def _print_judge_result(data: dict) -> None:
|
|
|
848
896
|
if len(reasoning) > 200:
|
|
849
897
|
reasoning = reasoning[:200] + "..."
|
|
850
898
|
print(f"[C] {name}: {cscore}/{cmax} — {reasoning}")
|
|
899
|
+
|
|
900
|
+
# Emit structured criteria via stdout markers so the orchestrator
|
|
901
|
+
# (_extract_criteria_from_stdout) and client can render a rubric
|
|
902
|
+
# breakdown. Schema per element:
|
|
903
|
+
# {"criteria": str, "score": float, "score_out_of": float, "description"?: str}
|
|
904
|
+
_print_criteria_markers(criteria)
|
|
851
905
|
else:
|
|
852
906
|
print(f"[C] Score: {normalized:.2f}")
|
|
853
907
|
|
|
@@ -117,6 +117,7 @@ tests/test_app_method.py
|
|
|
117
117
|
tests/test_expect_exactly.py
|
|
118
118
|
tests/test_expect_only.py
|
|
119
119
|
tests/test_instance_dispatch.py
|
|
120
|
+
tests/test_judge_criteria_markers.py
|
|
120
121
|
tests/test_sqlite_resource_dual_mode.py
|
|
121
122
|
tests/test_sqlite_shared_memory_behavior.py
|
|
122
123
|
tests/test_verifier_from_string.py
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Tests for structured criteria stdout markers in fleet.judge.
|
|
2
|
+
|
|
3
|
+
Validates that _print_criteria_markers emits the correct
|
|
4
|
+
>>> CRITERIA >>> / <<< CRITERIA <<< markers that the orchestrator
|
|
5
|
+
(theseus PR #1967) and client (client PR #1737) expect.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import re
|
|
10
|
+
from io import StringIO
|
|
11
|
+
from unittest.mock import patch
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from fleet.judge import (
|
|
16
|
+
_print_criteria_markers,
|
|
17
|
+
_print_judge_result,
|
|
18
|
+
_parse_grade_response,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# Captures the payload printed between the opening and closing criteria
# markers. DOTALL lets the non-greedy group span several lines.
_MARKER_RE = re.compile(r">>> CRITERIA >>>\s*\n(.*?)\n<<< CRITERIA <<<", flags=re.DOTALL)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _capture_print(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return what it wrote to stdout.

    Uses ``contextlib.redirect_stdout`` so ``print`` keeps its real
    semantics: ``sep=`` and ``end=`` are honoured, and output sent elsewhere
    with ``file=`` (e.g. stderr) is not captured as stdout.

    Args:
        fn: Callable to invoke.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        The text written to ``sys.stdout`` while ``fn`` was running.
    """
    # Local import keeps this helper self-contained; the module's top-level
    # imports are left untouched.
    from contextlib import redirect_stdout

    buf = StringIO()
    with redirect_stdout(buf):
        fn(*args, **kwargs)
    return buf.getvalue()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _extract_criteria_from_stdout(stdout: str):
    """Return the criteria list found between the markers, else ``None``.

    Mirrors the orchestrator's extraction logic (theseus PR #1967): locate
    the first marker pair with ``_MARKER_RE``, JSON-decode the stripped
    payload, and accept it only when it decodes to a list.
    """
    match = _MARKER_RE.search(stdout)
    if match is None:
        return None

    payload = json.loads(match.group(1).strip())
    return payload if isinstance(payload, list) else None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# _print_criteria_markers
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
class TestPrintCriteriaMarkers:
    """Unit tests for the marker output of ``_print_criteria_markers``."""

    def test_basic_criteria_output(self):
        """Two criteria yield two marker entries with normalised scores."""
        judge_criteria = [
            {"name": "Accuracy", "score": 8, "max_score": 10, "reasoning": "Good job"},
            {"name": "Style", "score": 5, "max_score": 5, "reasoning": "Perfect"},
        ]
        captured = _capture_print(_print_criteria_markers, judge_criteria)
        entries = _extract_criteria_from_stdout(captured)

        assert entries is not None, f"Markers not found in stdout:\n{captured}"
        assert len(entries) == 2

        accuracy, style = entries

        assert accuracy["criteria"] == "Accuracy"
        assert accuracy["score"] == pytest.approx(0.8, abs=0.01)
        assert accuracy["score_out_of"] == 1.0
        assert accuracy["description"] == "Good job"

        assert style["criteria"] == "Style"
        assert style["score"] == pytest.approx(1.0, abs=0.01)
        assert style["score_out_of"] == 1.0

    def test_zero_max_score_passthrough(self):
        """A ``max_score`` of 0 leaves the raw score untouched."""
        captured = _capture_print(
            _print_criteria_markers,
            [{"name": "Metric", "score": 0.75, "max_score": 0}],
        )
        entries = _extract_criteria_from_stdout(captured)

        assert entries is not None
        assert entries[0]["score"] == pytest.approx(0.75, abs=0.01)

    def test_empty_criteria_no_markers(self):
        """No criteria means no opening marker is printed."""
        captured = _capture_print(_print_criteria_markers, [])
        assert ">>> CRITERIA >>>" not in captured

    def test_reasoning_maps_to_description(self):
        """The judge's ``reasoning`` is emitted under the ``description`` key."""
        captured = _capture_print(
            _print_criteria_markers,
            [{"name": "Test", "score": 3, "max_score": 5, "reasoning": "Some reasoning here"}],
        )
        first = _extract_criteria_from_stdout(captured)[0]
        assert first["description"] == "Some reasoning here"

    def test_missing_reasoning_no_description(self):
        """An empty ``reasoning`` leaves ``description`` out of the entry."""
        captured = _capture_print(
            _print_criteria_markers,
            [{"name": "Test", "score": 3, "max_score": 5, "reasoning": ""}],
        )
        first = _extract_criteria_from_stdout(captured)[0]
        assert "description" not in first

    def test_output_parseable_by_orchestrator_regex(self):
        """The output matches the orchestrator's regex, spelled out literally."""
        captured = _capture_print(
            _print_criteria_markers,
            [{"name": "A", "score": 1, "max_score": 2, "reasoning": "half"}],
        )

        # Deliberately not reusing _MARKER_RE: this is the exact regex from
        # theseus PR #1967, written out independently.
        orchestrator_pattern = r">>> CRITERIA >>>\s*\n(.*?)\n<<< CRITERIA <<<"
        match = re.search(orchestrator_pattern, captured, re.DOTALL)
        assert match is not None, "Output doesn't match orchestrator regex"

        decoded = json.loads(match.group(1).strip())
        assert isinstance(decoded, list)
        assert decoded[0]["criteria"] == "A"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# _print_judge_result integration
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
class TestPrintJudgeResult:
    """Integration tests: ``_print_judge_result`` and the criteria markers."""

    def test_criteria_markers_emitted(self):
        """A response that carries criteria yields one entry per criterion."""
        response = dict(
            model_used="claude-sonnet",
            provider_used="anthropic",
            total_score=15,
            max_score=20,
            normalized_score=0.75,
            criteria=[
                {"name": "Accuracy", "score": 8, "max_score": 10, "reasoning": "Good"},
                {"name": "Style", "score": 7, "max_score": 10, "reasoning": "Decent"},
            ],
        )
        entries = _extract_criteria_from_stdout(
            _capture_print(_print_judge_result, response)
        )
        assert entries is not None
        assert len(entries) == 2

    def test_no_criteria_no_markers(self):
        """A response without a ``criteria`` key prints no opening marker."""
        response = dict(
            model_used="claude-sonnet",
            provider_used="anthropic",
            total_score=0,
            max_score=0,
            normalized_score=0.5,
        )
        captured = _capture_print(_print_judge_result, response)
        assert ">>> CRITERIA >>>" not in captured
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
# _parse_grade_response integration
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
|
|
173
|
+
class TestParseGradeResponse:
    """Integration tests: ``_parse_grade_response`` and the criteria markers."""

    def test_full_flow_emits_markers(self):
        """_parse_grade_response → _print_judge_result → criteria markers."""
        response = dict(
            model_used="claude-sonnet",
            provider_used="anthropic",
            total_score=9,
            max_score=10,
            normalized_score=0.9,
            criteria=[
                {"name": "Completeness", "score": 9, "max_score": 10, "reasoning": "Almost perfect"},
            ],
        )
        entries = _extract_criteria_from_stdout(
            _capture_print(_parse_grade_response, response)
        )
        assert entries is not None

        first = entries[0]
        assert first["criteria"] == "Completeness"
        assert first["score"] == pytest.approx(0.9, abs=0.01)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|