synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (48):
  1. examples/agora_ex/README_MoE.md +224 -0
  2. examples/agora_ex/__init__.py +7 -0
  3. examples/agora_ex/agora_ex.py +65 -0
  4. examples/agora_ex/agora_ex_task_app.py +590 -0
  5. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
  6. examples/agora_ex/reward_fn_grpo-human.py +129 -0
  7. examples/agora_ex/system_prompt_CURRENT.md +63 -0
  8. examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
  9. examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
  10. examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
  11. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  12. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
  13. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  14. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  15. examples/multi_step/crafter_rl_lora.md +51 -10
  16. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  17. examples/multi_step/task_app_config_notes.md +7 -1
  18. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
  19. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
  20. examples/warming_up_to_rl/run_eval.py +127 -18
  21. examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
  22. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  23. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
  24. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +232 -193
  25. synth_ai/__init__.py +41 -1
  26. synth_ai/api/train/builders.py +49 -19
  27. synth_ai/api/train/configs/__init__.py +44 -0
  28. synth_ai/api/train/configs/rl.py +133 -0
  29. synth_ai/api/train/configs/sft.py +94 -0
  30. synth_ai/api/train/configs/shared.py +24 -0
  31. synth_ai/cli/demo.py +38 -39
  32. synth_ai/cli/rl_demo.py +81 -102
  33. synth_ai/cli/task_apps.py +3 -0
  34. synth_ai/demos/core/cli.py +121 -159
  35. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  36. synth_ai/evals/__init__.py +15 -0
  37. synth_ai/evals/client.py +85 -0
  38. synth_ai/evals/types.py +42 -0
  39. synth_ai/judge_schemas.py +127 -0
  40. synth_ai/rubrics/__init__.py +22 -0
  41. synth_ai/rubrics/validators.py +126 -0
  42. synth_ai/tracing_v3/serialization.py +130 -0
  43. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +1 -1
  44. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +48 -22
  45. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
  46. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
  47. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
  48. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- import argparse
4
3
  import contextlib
5
4
  import json
6
5
  import os
@@ -45,7 +44,7 @@ def _is_modal_public_url(u: str) -> bool:
45
44
  return False
46
45
 
47
46
 
48
- def cmd_setup(_args: argparse.Namespace) -> int:
47
+ def setup() -> int:
49
48
  # Change to demo directory if stored
50
49
  demo_dir = demo_core.load_demo_dir()
51
50
  if demo_dir and os.path.isdir(demo_dir):
@@ -760,7 +759,9 @@ def _ensure_task_app_ready(env: DemoEnv, synth_key: str, *, label: str) -> DemoE
760
759
  return updated_env
761
760
 
762
761
 
763
- def cmd_deploy(args: argparse.Namespace) -> int:
762
+ def deploy(
763
+ local: bool = False, app: str | None = None, name: str | None = None, script: str | None = None
764
+ ) -> int:
764
765
  # Change to demo directory if stored
765
766
  demo_dir = demo_core.load_demo_dir()
766
767
  if demo_dir and os.path.isdir(demo_dir):
@@ -774,7 +775,7 @@ def cmd_deploy(args: argparse.Namespace) -> int:
774
775
  url = ""
775
776
  app_name = env.task_app_name or ""
776
777
  try:
777
- if args.local:
778
+ if local:
778
779
  print("Starting local Task App…")
779
780
  import subprocess
780
781
 
@@ -798,7 +799,7 @@ def cmd_deploy(args: argparse.Namespace) -> int:
798
799
  time.sleep(1)
799
800
  else:
800
801
  # Auto-detect app path if not supplied; prompt interactively from discovered ASGI apps
801
- app_path = os.path.abspath(args.app) if args.app else None
802
+ app_path = os.path.abspath(app) if app else None
802
803
  if not app_path or not os.path.isfile(app_path):
803
804
  # First pass: look for known common filenames
804
805
  candidates = [
@@ -828,13 +829,13 @@ def cmd_deploy(args: argparse.Namespace) -> int:
828
829
  choice = 1
829
830
  choice = max(1, min(choice, len(found)))
830
831
  app_path = str(found[choice - 1].resolve())
831
- if not app_path and args.script:
832
+ if not app_path and script:
832
833
  # Legacy script fallback if user supplied --script explicitly
833
834
  from synth_ai.demos.demo_task_apps.math.deploy_modal import deploy as modal_deploy
834
835
 
835
- url = modal_deploy(script_path=args.script, env_api_key=env.env_api_key)
836
- if args.name:
837
- app_name = args.name
836
+ url = modal_deploy(script_path=script, env_api_key=env.env_api_key)
837
+ if name:
838
+ app_name = name
838
839
  else:
839
840
  if not app_path:
840
841
  entered = input("Path to Modal app.py (e.g., ./task_app.py): ").strip()
@@ -845,7 +846,7 @@ def cmd_deploy(args: argparse.Namespace) -> int:
845
846
  raise FileNotFoundError(f"App file not found: {app_path}")
846
847
  # Surface the app path before asking for the name
847
848
  print(f"Using task app: {app_path}")
848
- existing_name = (args.name or env.task_app_name or "").strip()
849
+ existing_name = (name or env.task_app_name or "").strip()
849
850
  if not existing_name:
850
851
  existing_name = f"synth-{os.path.splitext(os.path.basename(app_path))[0]}"
851
852
  suggested_name = existing_name
@@ -1128,7 +1129,7 @@ def _ensure_modal_installed() -> None:
1128
1129
  print("\n You can deploy later after authenticating.\n")
1129
1130
 
1130
1131
 
1131
- def cmd_init(args: argparse.Namespace) -> int:
1132
+ def init(template: str | None = None, dest: str | None = None, force: bool = False) -> int:
1132
1133
  """Materialise a demo task app template into the current directory."""
1133
1134
 
1134
1135
  templates = list(list_demo_templates())
@@ -1137,37 +1138,44 @@ def cmd_init(args: argparse.Namespace) -> int:
1137
1138
  return 1
1138
1139
 
1139
1140
  selected: DemoTemplate | None = None
1140
- if args.template:
1141
- selected = get_demo_template(args.template)
1141
+ if template:
1142
+ selected = get_demo_template(template)
1142
1143
  if selected is None:
1143
1144
  available = ", ".join(t.template_id for t in templates)
1144
- print(f"Unknown template '{args.template}'. Available: {available}")
1145
+ print(f"Unknown template '{template}'. Available: {available}")
1145
1146
  return 1
1146
1147
  else:
1147
- print("Select a demo template:" + "\n")
1148
- for idx, template in enumerate(templates, start=1):
1149
- print(f" [{idx}] {template.name} ({template.template_id})")
1150
- print(f" {template.description}")
1151
- try:
1152
- choice_raw = input(f"Enter choice [1-{len(templates)}] (default 1): ").strip() or "1"
1153
- except Exception:
1154
- choice_raw = "1"
1155
- if not choice_raw.isdigit():
1156
- print("Selection must be a number.")
1157
- return 1
1158
- choice_idx = int(choice_raw)
1159
- if not 1 <= choice_idx <= len(templates):
1160
- print("Selection out of range.")
1161
- return 1
1162
- selected = templates[choice_idx - 1]
1148
+ if force:
1149
+ selected = templates[0]
1150
+ print(
1151
+ f"Using default template: {selected.name} ({selected.template_id}) "
1152
+ f"(pass --template to choose another)"
1153
+ )
1154
+ else:
1155
+ print("Select a demo template:" + "\n")
1156
+ for idx, tpl in enumerate(templates, start=1):
1157
+ print(f" [{idx}] {tpl.name} ({tpl.template_id})")
1158
+ print(f" {tpl.description}")
1159
+ try:
1160
+ choice_raw = input(f"Enter choice [1-{len(templates)}] (default 1): ").strip() or "1"
1161
+ except Exception:
1162
+ choice_raw = "1"
1163
+ if not choice_raw.isdigit():
1164
+ print("Selection must be a number.")
1165
+ return 1
1166
+ choice_idx = int(choice_raw)
1167
+ if not 1 <= choice_idx <= len(templates):
1168
+ print("Selection out of range.")
1169
+ return 1
1170
+ selected = templates[choice_idx - 1]
1163
1171
 
1164
1172
  assert selected is not None
1165
1173
 
1166
1174
  default_subdir = selected.default_subdir or selected.template_id
1167
1175
 
1168
1176
  # Check if default destination is already occupied and switch to local_demos/ if needed
1169
- if args.dest:
1170
- default_dest = Path(args.dest).expanduser().resolve()
1177
+ if dest:
1178
+ default_dest = Path(dest).expanduser().resolve()
1171
1179
  else:
1172
1180
  primary_dest = Path.cwd() / default_subdir
1173
1181
  if primary_dest.exists() and any(primary_dest.iterdir()):
@@ -1176,10 +1184,13 @@ def cmd_init(args: argparse.Namespace) -> int:
1176
1184
  else:
1177
1185
  default_dest = primary_dest.resolve()
1178
1186
 
1179
- try:
1180
- dest_input = input(f"Destination directory [{default_dest}]: ").strip()
1181
- except Exception:
1187
+ if force:
1182
1188
  dest_input = ""
1189
+ else:
1190
+ try:
1191
+ dest_input = input(f"Destination directory [{default_dest}]: ").strip()
1192
+ except Exception:
1193
+ dest_input = ""
1183
1194
  destination = Path(dest_input).expanduser().resolve() if dest_input else default_dest
1184
1195
 
1185
1196
  # Track whether we should skip individual file prompts (if we already cleared the directory)
@@ -1190,15 +1201,18 @@ def cmd_init(args: argparse.Namespace) -> int:
1190
1201
  print(f"Destination {destination} is a file. Provide a directory path.")
1191
1202
  return 1
1192
1203
  if any(destination.iterdir()):
1193
- try:
1194
- response = (
1195
- input(f"Destination {destination} is not empty. Overwrite? [y/N]: ")
1196
- .strip()
1197
- .lower()
1198
- )
1199
- except (EOFError, KeyboardInterrupt):
1200
- print("\nCancelled.")
1201
- return 1
1204
+ if force:
1205
+ response = "y"
1206
+ else:
1207
+ try:
1208
+ response = (
1209
+ input(f"Destination {destination} is not empty. Overwrite? [y/N]: ")
1210
+ .strip()
1211
+ .lower()
1212
+ )
1213
+ except (EOFError, KeyboardInterrupt):
1214
+ print("\nCancelled.")
1215
+ return 1
1202
1216
  if response not in ("y", "yes"):
1203
1217
  print("Cancelled. Choose another directory or delete the existing one.")
1204
1218
  return 1
@@ -1236,15 +1250,18 @@ def cmd_init(args: argparse.Namespace) -> int:
1236
1250
  # Handle directory copying
1237
1251
  if src_path.is_dir():
1238
1252
  if dest_path.exists() and not directory_cleared:
1239
- try:
1240
- response = (
1241
- input(f"Directory {dest_path.name} exists. Overwrite? [y/N]: ")
1242
- .strip()
1243
- .lower()
1244
- )
1245
- except (EOFError, KeyboardInterrupt):
1246
- print("\nCancelled.")
1247
- return 1
1253
+ if force:
1254
+ response = "y"
1255
+ else:
1256
+ try:
1257
+ response = (
1258
+ input(f"Directory {dest_path.name} exists. Overwrite? [y/N]: ")
1259
+ .strip()
1260
+ .lower()
1261
+ )
1262
+ except (EOFError, KeyboardInterrupt):
1263
+ print("\nCancelled.")
1264
+ return 1
1248
1265
  if response not in ("y", "yes"):
1249
1266
  print(f"Skipping {dest_path.name}")
1250
1267
  continue
@@ -1256,15 +1273,18 @@ def cmd_init(args: argparse.Namespace) -> int:
1256
1273
  # Handle file copying
1257
1274
  dest_path.parent.mkdir(parents=True, exist_ok=True)
1258
1275
  if dest_path.exists() and not directory_cleared:
1259
- try:
1260
- response = (
1261
- input(f"File {dest_path.name} exists. Overwrite? [y/N]: ")
1262
- .strip()
1263
- .lower()
1264
- )
1265
- except (EOFError, KeyboardInterrupt):
1266
- print("\nCancelled.")
1267
- return 1
1276
+ if force:
1277
+ response = "y"
1278
+ else:
1279
+ try:
1280
+ response = (
1281
+ input(f"File {dest_path.name} exists. Overwrite? [y/N]: ")
1282
+ .strip()
1283
+ .lower()
1284
+ )
1285
+ except (EOFError, KeyboardInterrupt):
1286
+ print("\nCancelled.")
1287
+ return 1
1268
1288
  if response not in ("y", "yes"):
1269
1289
  print(f"Skipping {dest_path.name}")
1270
1290
  continue
@@ -1280,11 +1300,14 @@ def cmd_init(args: argparse.Namespace) -> int:
1280
1300
  env_path = destination / ".env"
1281
1301
  should_write = True
1282
1302
  if env_path.exists() and not directory_cleared:
1283
- try:
1284
- response = input("File .env exists. Overwrite? [y/N]: ").strip().lower()
1285
- except (EOFError, KeyboardInterrupt):
1286
- print("\nCancelled.")
1287
- return 1
1303
+ if force:
1304
+ response = "y"
1305
+ else:
1306
+ try:
1307
+ response = input("File .env exists. Overwrite? [y/N]: ").strip().lower()
1308
+ except (EOFError, KeyboardInterrupt):
1309
+ print("\nCancelled.")
1310
+ return 1
1288
1311
  should_write = response in ("y", "yes")
1289
1312
  if should_write:
1290
1313
  _write_text(env_path, "\n".join(selected.env_lines) + "\n")
@@ -1296,13 +1319,16 @@ def cmd_init(args: argparse.Namespace) -> int:
1296
1319
  cfg_dst = (destination / selected.config_destination).resolve()
1297
1320
  should_copy = True
1298
1321
  if cfg_dst.exists() and not directory_cleared:
1299
- try:
1300
- response = (
1301
- input(f"File {cfg_dst.name} exists. Overwrite? [y/N]: ").strip().lower()
1302
- )
1303
- except (EOFError, KeyboardInterrupt):
1304
- print("\nCancelled.")
1305
- return 1
1322
+ if force:
1323
+ response = "y"
1324
+ else:
1325
+ try:
1326
+ response = (
1327
+ input(f"File {cfg_dst.name} exists. Overwrite? [y/N]: ").strip().lower()
1328
+ )
1329
+ except (EOFError, KeyboardInterrupt):
1330
+ print("\nCancelled.")
1331
+ return 1
1306
1332
  should_copy = response in ("y", "yes")
1307
1333
  if should_copy:
1308
1334
  cfg_dst.parent.mkdir(parents=True, exist_ok=True)
@@ -1388,7 +1414,14 @@ def _write_text(path: str, content: str) -> None:
1388
1414
  # Note: `prepare` command has been removed; configuration now prepares TOML
1389
1415
 
1390
1416
 
1391
- def cmd_run(args: argparse.Namespace) -> int:
1417
+ def run(
1418
+ config: str | None = None,
1419
+ batch_size: int | None = None,
1420
+ group_size: int | None = None,
1421
+ model: str | None = None,
1422
+ timeout: int = 600,
1423
+ dry_run: bool = False,
1424
+ ) -> int:
1392
1425
  # Change to demo directory if stored
1393
1426
  demo_dir = demo_core.load_demo_dir()
1394
1427
  if demo_dir and os.path.isdir(demo_dir):
@@ -1429,7 +1462,7 @@ def cmd_run(args: argparse.Namespace) -> int:
1429
1462
  import tomllib
1430
1463
 
1431
1464
  try:
1432
- cfg_path = _select_or_create_config(getattr(args, "config", None), env)
1465
+ cfg_path = _select_or_create_config(config, env)
1433
1466
  except FileNotFoundError as exc:
1434
1467
  print(exc)
1435
1468
  return 1
@@ -1451,12 +1484,12 @@ def cmd_run(args: argparse.Namespace) -> int:
1451
1484
  # Optional: TRAINER_START_URL passthrough if already set in environment
1452
1485
  run_env["TRAINER_START_URL"] = run_env.get("TRAINER_START_URL", "")
1453
1486
  # Forward convenience knobs
1454
- if args.batch_size is not None:
1455
- run_env["RL_BATCH_SIZE"] = str(int(args.batch_size))
1456
- if args.group_size is not None:
1457
- run_env["RL_GROUP_SIZE"] = str(int(args.group_size))
1458
- if args.model:
1459
- run_env["RL_MODEL"] = args.model
1487
+ if batch_size is not None:
1488
+ run_env["RL_BATCH_SIZE"] = str(int(batch_size))
1489
+ if group_size is not None:
1490
+ run_env["RL_GROUP_SIZE"] = str(int(group_size))
1491
+ if model:
1492
+ run_env["RL_MODEL"] = model
1460
1493
  cmd = ["uv", "run", "python", launcher]
1461
1494
  print(f"Launching monorepo clustered runner: {' '.join(cmd)}")
1462
1495
  code = _popen_stream(cmd, env=run_env)
@@ -1484,11 +1517,11 @@ def cmd_run(args: argparse.Namespace) -> int:
1484
1517
  inline_cfg = tomllib.load(fh)
1485
1518
  with open(cfg_path) as fh2:
1486
1519
  toml_text = fh2.read()
1487
- if args.batch_size is not None:
1488
- inline_cfg.setdefault("training", {})["batch_size"] = int(args.batch_size)
1489
- if args.group_size is not None:
1490
- inline_cfg.setdefault("training", {})["group_size"] = int(args.group_size)
1491
- model_name = args.model or (inline_cfg.get("model", {}) or {}).get("name", "Qwen/Qwen3-0.6B")
1520
+ if batch_size is not None:
1521
+ inline_cfg.setdefault("training", {})["batch_size"] = int(batch_size)
1522
+ if group_size is not None:
1523
+ inline_cfg.setdefault("training", {})["group_size"] = int(group_size)
1524
+ model_name = model or (inline_cfg.get("model", {}) or {}).get("name", "Qwen/Qwen3-0.6B")
1492
1525
  api = env.dev_backend_url.rstrip("/") + ("" if env.dev_backend_url.endswith("/api") else "/api")
1493
1526
  # Print backend and key preview before request for clearer diagnostics
1494
1527
  try:
@@ -1678,79 +1711,8 @@ def cmd_run(args: argparse.Namespace) -> int:
1678
1711
  if name == "eval.reward_mean":
1679
1712
  print(f"metric eval.reward_mean step={p.get('step')} value={p.get('value')}")
1680
1713
  break
1681
- if time.time() - start_t > (args.timeout or 600):
1714
+ if time.time() - start_t > (timeout or 600):
1682
1715
  print("Timeout waiting for terminal state.")
1683
1716
  break
1684
1717
  time.sleep(2)
1685
1718
  return 0
1686
-
1687
-
1688
- def main(argv: list[str] | None = None) -> int:
1689
- p = argparse.ArgumentParser(prog="synth-ai")
1690
- sub = p.add_subparsers(dest="cmd")
1691
-
1692
- def _add_parser(
1693
- names: list[str], *, configure: Callable[[argparse.ArgumentParser], None]
1694
- ) -> None:
1695
- for name in names:
1696
- parser = sub.add_parser(name)
1697
- configure(parser)
1698
-
1699
- _add_parser(
1700
- ["rl_demo.setup", "demo.setup"],
1701
- configure=lambda parser: parser.set_defaults(func=cmd_setup),
1702
- )
1703
-
1704
- def _init_opts(parser):
1705
- parser.add_argument("--template", type=str, default=None, help="Template id to instantiate")
1706
- parser.add_argument(
1707
- "--dest", type=str, default=None, help="Destination directory for files"
1708
- )
1709
- parser.set_defaults(func=cmd_init)
1710
-
1711
- _add_parser(["rl_demo.init", "demo.init"], configure=_init_opts)
1712
-
1713
- # (prepare command removed)
1714
-
1715
- def _deploy_opts(parser):
1716
- parser.add_argument(
1717
- "--local", action="store_true", help="Run local FastAPI instead of Modal deploy"
1718
- )
1719
- parser.add_argument(
1720
- "--app", type=str, default=None, help="Path to Modal app.py for uv run modal deploy"
1721
- )
1722
- parser.add_argument("--name", type=str, default=None, help="Modal app name")
1723
- parser.add_argument(
1724
- "--script", type=str, default=None, help="Path to deploy_task_app.sh (optional legacy)"
1725
- )
1726
- parser.set_defaults(func=cmd_deploy)
1727
-
1728
- _add_parser(["rl_demo.deploy", "demo.deploy"], configure=_deploy_opts)
1729
-
1730
- _add_parser(
1731
- ["rl_demo.configure", "demo.configure"],
1732
- configure=lambda parser: parser.set_defaults(func=cmd_run),
1733
- )
1734
-
1735
- def _run_opts(parser):
1736
- parser.add_argument(
1737
- "--config", type=str, default=None, help="Path to TOML config (skip prompt)"
1738
- )
1739
- parser.add_argument("--batch-size", type=int, default=None)
1740
- parser.add_argument("--group-size", type=int, default=None)
1741
- parser.add_argument("--model", type=str, default=None)
1742
- parser.add_argument("--timeout", type=int, default=600)
1743
- parser.add_argument("--dry-run", action="store_true", help="Print request body and exit")
1744
- parser.set_defaults(func=cmd_run)
1745
-
1746
- _add_parser(["run", "rl_demo.run", "demo.run"], configure=_run_opts)
1747
-
1748
- args = p.parse_args(argv)
1749
- if not hasattr(args, "func"):
1750
- p.print_help()
1751
- return 1
1752
- return int(args.func(args) or 0)
1753
-
1754
-
1755
- if __name__ == "__main__":
1756
- sys.exit(main())
@@ -190,6 +190,22 @@ class SynthCrafterObservationCallable(GetObservationCallable):
190
190
  obs_dict["truncated"] = priv.truncated
191
191
  if pub.error_info:
192
192
  obs_dict["tool_error"] = pub.error_info
193
+ counts_payload = {}
194
+ try:
195
+ counts = getattr(priv, "achievements_current_values", {}) or {}
196
+ for k, v in counts.items():
197
+ try:
198
+ counts_payload[str(k)] = int(v)
199
+ except Exception:
200
+ try:
201
+ counts_payload[str(k)] = int(float(v))
202
+ except Exception:
203
+ continue
204
+ if counts_payload:
205
+ obs_dict["achievements_counts"] = counts_payload
206
+ except Exception:
207
+ # Best effort; omit counts if coercion fails
208
+ pass
193
209
 
194
210
  # Derive a simple local semantic patch around the player for easy rendering
195
211
  try:
@@ -0,0 +1,15 @@
1
+ from .client import JudgeClient, JudgeOptions, JudgeScoreResponse
2
+ from .types import Judgement, RewardJudgement, RewardMetadata, Track, TrackAggregate
3
+
4
+ __all__ = [
5
+ "JudgeClient",
6
+ "JudgeOptions",
7
+ "JudgeScoreResponse",
8
+ "Judgement",
9
+ "RewardJudgement",
10
+ "RewardMetadata",
11
+ "Track",
12
+ "TrackAggregate",
13
+ ]
14
+
15
+
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ """Experimental Judge API client.
4
+
5
+ This surface is experimental and subject to change without notice.
6
+ Set environment variable `SYNTH_SILENCE_EXPERIMENTAL=1` to silence warnings.
7
+ """
8
+
9
+ import os
10
+ import warnings
11
+ from typing import Any, Literal, TypedDict
12
+
13
+ from synth_ai.http import AsyncHttpClient, HTTPError
14
+ from synth_ai.tracing_v3.serialization import normalize_for_json
15
+
16
+
17
+ Provider = Literal["groq", "gemini"]
18
+
19
+
20
class JudgeOptions(TypedDict, total=False):
    """Options sent as the ``options`` field of a judge score request.

    Declared with ``total=False``, so every key is optional and callers may
    pass any subset.
    """

    # NOTE(review): presumably enables event-level scoring — confirm against the judge API.
    event: bool
    # NOTE(review): presumably enables outcome-level scoring — confirm against the judge API.
    outcome: bool
    # Identifier of the rubric to use (resolution is server-side; not visible here).
    rubric_id: str
    # Free-form overrides for the rubric; schema is not defined in this module.
    rubric_overrides: dict[str, Any]
    # Restricted to the literals allowed by ``Provider``.
    provider: Provider
    # Model name; NOTE(review): presumably interpreted relative to ``provider`` — confirm.
    model: str
    # NOTE(review): presumably caps concurrent judge calls on the backend — confirm.
    max_concurrency: int
28
+
29
+
30
class JudgeScoreResponse(TypedDict, total=False):
    """Declared shape of the JSON object returned by ``JudgeClient.score``.

    All keys are optional (``total=False``). ``score`` only verifies that the
    response is a ``dict``; individual keys are not validated client-side.
    """

    # Status string reported by the judge service (allowed values not defined here).
    status: str
    # List of reward dicts; NOTE(review): presumably one per judged event — confirm.
    event_rewards: list[dict[str, Any]]
    # Single reward dict; NOTE(review): presumably for the overall outcome — confirm.
    outcome_reward: dict[str, Any]
    # Additional free-form details from the service.
    details: dict[str, Any]
35
+
36
+
37
class JudgeClient:
    """Experimental async client for the judge scoring endpoint.

    Instantiating the client emits a ``UserWarning`` unless the environment
    variable ``SYNTH_SILENCE_EXPERIMENTAL`` is set to a truthy value
    (``1``, ``true``, ``t``, ``yes``, ``y`` or ``on``, case-insensitive).
    """

    def __init__(self, base_url: str, api_key: str, *, timeout: float = 60.0) -> None:
        """Store connection settings; no network I/O happens here.

        Args:
            base_url: Backend base URL; trailing slashes are stripped.
            api_key: API key forwarded to the HTTP client on each request.
            timeout: Timeout forwarded to the HTTP client on each request.
        """
        _silence = (os.getenv("SYNTH_SILENCE_EXPERIMENTAL") or "").strip().lower()
        if _silence not in {"1", "true", "t", "yes", "y", "on"}:
            warnings.warn(
                "Experimental API: synth_ai.evals.JudgeClient is experimental and may change without notice.",
                UserWarning,
                stacklevel=2,  # attribute the warning to the caller constructing the client
            )
        self._base = base_url.rstrip("/")
        self._key = api_key
        self._timeout = timeout

    async def score(
        self,
        *,
        trace: dict[str, Any] | Any,
        policy_name: str,
        task_app_id: str,
        options: JudgeOptions,
        task_app_base_url: str | None = None,
    ) -> JudgeScoreResponse:
        """POST a trace to ``/api/judge/v1/score`` and return the JSON response.

        Args:
            trace: Trace payload; passed through ``normalize_for_json`` before sending.
            policy_name: Sent as ``policy_name`` in the request body.
            task_app_id: Sent as ``task_app.id`` in the request body.
            options: Judge options; a falsy value is sent as an empty object.
            task_app_base_url: When truthy, sent as ``task_app.base_url``.

        Returns:
            The decoded response, which is only checked to be a ``dict``.

        Raises:
            ValueError: The response is not a dict, or the server returned 400/422.
            PermissionError: The server returned 401/403.
            FileNotFoundError: The server returned 404.
            RuntimeError: The server returned 429 or any 5xx status.
            HTTPError: Any other HTTP failure is re-raised unchanged.
        """
        task_app: dict[str, Any] = {"id": task_app_id}
        if task_app_base_url:
            task_app["base_url"] = task_app_base_url
        body = {
            "policy_name": policy_name,
            "task_app": task_app,
            "trace": normalize_for_json(trace),
            "options": options or {},
        }
        try:
            async with AsyncHttpClient(self._base, self._key, timeout=self._timeout) as http:
                js = await http.post_json("/api/judge/v1/score", json=body)
                if not isinstance(js, dict):
                    raise ValueError("invalid_judge_response_shape")
                return js  # type: ignore[return-value]
        except HTTPError as e:  # map to friendlier exceptions
            status = int(getattr(e, "status", 0) or 0)
            # Read defensively, like ``status``: a missing attribute must not raise
            # AttributeError here and hide the real HTTP failure.
            detail = getattr(e, "detail", None)
            if status in (400, 422):
                raise ValueError(f"judge_validation_error: {detail}") from e
            if status in (401, 403):
                raise PermissionError(f"judge_auth_error: {detail}") from e
            if status == 404:
                raise FileNotFoundError(f"judge_route_not_found: {detail}") from e
            # RuntimeError rather than bare Exception so callers can catch these
            # without a catch-all; existing ``except Exception`` handlers still match.
            if status == 429:
                raise RuntimeError("judge_rate_limited") from e  # replace with RetryLater in future
            if status >= 500:
                raise RuntimeError("judge_transient_error") from e  # replace with TransientError in future
            raise
84
+
85
+
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal, TypedDict
4
+
5
+ Track = Literal["process", "reasoning", "progress", "outcome"]
6
+
7
+
8
class Judgement(TypedDict, total=False):
    """A single judgement record; every key is optional (``total=False``)."""

    # Short identifier and human-readable title/description of what was judged.
    key: str
    title: str
    description: str
    # Numeric score; NOTE(review): range presumably depends on ``scale`` — confirm.
    score: float
    # Free-text explanation accompanying the score.
    reason: str
    # NOTE(review): presumably the judge's confidence in ``score``; range not defined here.
    confidence: float
    # Kind of scale the score is expressed on.
    scale: Literal["binary", "bounded", "count", "custom"]
    # Untyped mapping describing where the judgement came from; schema not defined here.
    source: dict
17
+
18
+
19
class RewardJudgement(TypedDict, total=False):
    """A ``Judgement`` plus the context it applies to; all keys optional."""

    # The wrapped judgement record.
    judgement: Judgement
    # Granularity the judgement applies to.
    scope: Literal["step", "event", "outcome"]
    # Turn index, or None; NOTE(review): presumably None for outcome scope — confirm.
    turn: int | None
    # Episode identifier, or None when not associated with an episode.
    episode_id: str | None
    # Reward value derived from the judgement, or None when absent.
    reward_value: float | None
    # Untyped mapping of related references; schema not defined here.
    links: dict
26
+
27
+
28
class TrackAggregate(TypedDict, total=False):
    """Summary statistics for one track; all keys optional (``total=False``).

    NOTE(review): presumably computed over judgement scores of a single
    ``Track`` — confirm against the producer of ``RewardMetadata.aggregates``.
    """

    mean: float
    median: float
    # Standard deviation; NOTE(review): population vs. sample is not specified here.
    std: float
    # Number of values aggregated.
    n: int
33
+
34
+
35
class RewardMetadata(TypedDict, total=False):
    """Container bundling reward judgements with aggregates and provenance.

    All keys are optional (``total=False``).
    """

    # List of reward judgements; NOTE(review): "window" semantics are not defined here.
    per_window: list[RewardJudgement]
    # Aggregate statistics keyed by track name.
    aggregates: dict[Track, TrackAggregate]
    overall: dict[str, float]  # {"final_outcome_score": float}
    rubric: dict  # {"ids": {...}, "hash": "..."}
    model_info: dict  # {"model": "...", ...}
41
+
42
+