synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +6 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -38
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +288 -39
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
- synth_ai/api/train/builders.py +99 -4
- synth_ai/api/train/cli.py +516 -26
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +23 -2
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +61 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/auth/credentials.py +119 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +94 -18
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +30 -158
- synth_ai/cli/deploy/__init__.py +43 -0
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +51 -1480
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -10
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +518 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +45 -9
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +40 -33
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +285 -3
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Core dataclasses for baseline configuration and results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaselineTaskRunner:
    """Common parent for class-based baseline task runners.

    An instance keeps the policy and environment configuration it was built
    with. Concrete runners override the async ``run_task`` hook; the
    function-based approach (a standalone async function) does not use this
    class at all.
    """

    def __init__(
        self,
        policy_config: Dict[str, Any],
        env_config: Dict[str, Any],
    ):
        """Remember the configuration for later use by ``run_task``.

        Args:
            policy_config: Policy configuration (model, temperature, etc.)
            env_config: Environment configuration (max_steps, difficulty, etc.)
        """
        self.policy_config, self.env_config = policy_config, env_config

    async def run_task(self, seed: int) -> TaskResult:
        """Run one task instance; invoked once per seed of the chosen split.

        Args:
            seed: The seed/index for this task instance

        Returns:
            TaskResult: Structured result containing success, rewards,
            metadata, trace

        Raises:
            NotImplementedError: Always, in this base class; subclasses
                provide the real implementation.
        """
        raise NotImplementedError("Subclasses must implement run_task method")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class DataSplit:
    """A named data split (train/val/test) and the seeds it contains.

    Attributes:
        name: Split label, e.g. "train", "val", "test".
        seeds: Seed/index values for this split.
        metadata: Optional metadata; every instance gets its own empty dict
            when none is supplied.
    """

    name: str
    seeds: List[int]
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class TaskResult:
    """Outcome of executing a single task instance.

    Attributes:
        seed: Required. Seed/index that was evaluated.
        success: Required. Whether the task completed successfully.
        outcome_reward: Required. Outcome reward for the episode.
        event_rewards: Optional step-level event rewards.
        total_steps: Optional total number of steps/turns taken.
        metadata: Optional metadata (achievements, completion info, etc.).
        error: Optional error information for the ``success=False`` case.
        trace: Optional v3 trace (SessionTrace dict).
    """

    # Required fields.
    seed: int
    success: bool
    outcome_reward: float

    # Optional fields; mutable defaults are created per instance.
    event_rewards: List[Dict[str, Any]] = field(default_factory=list)
    total_steps: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None
    trace: Optional[Dict[str, Any]] = None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# A task runner is either a BaselineTaskRunner subclass or a callable taking
# (seed, policy_config, env_config).
TaskRunnerType = type[BaselineTaskRunner] | Callable[[int, dict[str, Any], dict[str, Any]], Any]

# A result aggregator is either a class exposing an aggregate() method or a
# callable mapping a list of TaskResult objects to a metrics dict.
AggregatorType = type[Any] | Callable[[list[TaskResult]], dict[str, Any]]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class BaselineConfig:
    """Describes one baseline defined in a baseline file.

    A baseline file says how to evaluate a task without a deployed task app:
    the evaluation logic is self-contained and train/val/test splits are
    first-class.

    The task runner may be given in either of two forms:
    - Class-based: a class inheriting from BaselineTaskRunner. It is
      instantiated with policy_config and env_config, and run_task(seed) is
      called for each seed.
    - Function-based: an async function with signature:
        async def task_runner(seed: int, policy_config: Dict[str, Any],
                              env_config: Dict[str, Any]) -> TaskResult

    The optional result aggregator likewise has two forms:
    - Class-based: a class with an aggregate(results: List[TaskResult])
      method; the class is instantiated and aggregate() called.
    - Function-based: a function with signature:
        def aggregate_results(results: List[TaskResult]) -> Dict[str, Any]
    """

    # --- Required ---------------------------------------------------------
    baseline_id: str  # Unique identifier for this baseline config
    name: str  # Human-readable name
    task_runner: TaskRunnerType  # Class or async function (see docstring)
    splits: Dict[str, DataSplit]  # Data splits (train/val/test)

    # --- Optional ---------------------------------------------------------
    description: str = ""  # Description for documentation
    default_policy_config: Dict[str, Any] = field(default_factory=dict)
    default_env_config: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)  # Filtering/organization
    tags: List[str] = field(default_factory=list)  # Filtering and discovery
    result_aggregator: Optional[AggregatorType] = None  # Custom aggregator

    # Path to this baseline file (set by discovery)
    _source_path: Optional[Path] = None

    def matches_tag(self, tag: str) -> bool:
        """Return True when ``tag`` equals one of ``self.tags``, ignoring case."""
        needle = tag.lower()
        lowered_tags = {candidate.lower() for candidate in self.tags}
        return needle in lowered_tags

    def matches_metadata(self, key: str, value: Any) -> bool:
        """Return whether ``self.metadata[key]`` equals ``value``.

        A missing key is looked up as ``None``.
        """
        stored = self.metadata.get(key)
        return stored == value
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@dataclass
class BaselineResults:
    """Everything produced by one baseline evaluation run.

    Attributes:
        config: Configuration that was used.
        split_name: Split that was evaluated.
        results: Per-seed results.
        aggregate_metrics: Aggregate metrics.
        execution_time_seconds: Execution metadata: run duration in seconds.
        model_name: Execution metadata: model name.
        timestamp: Execution metadata: timestamp string.
    """

    config: BaselineConfig
    split_name: str
    results: List[TaskResult]
    aggregate_metrics: Dict[str, Any]
    execution_time_seconds: float
    model_name: str
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        """Build a plain dictionary suitable for JSON output.

        Per-seed entries carry seed, success, outcome_reward, total_steps,
        metadata and error; event rewards and traces are not included.
        """
        payload: Dict[str, Any] = {
            "baseline_id": self.config.baseline_id,
            "name": self.config.name,
            "split": self.split_name,
            "model": self.model_name,
            "timestamp": self.timestamp,
            "execution_time_seconds": self.execution_time_seconds,
            "aggregate_metrics": self.aggregate_metrics,
        }

        per_seed = []
        for item in self.results:
            per_seed.append(
                {
                    "seed": item.seed,
                    "success": item.success,
                    "outcome_reward": item.outcome_reward,
                    "total_steps": item.total_steps,
                    "metadata": item.metadata,
                    "error": item.error,
                }
            )
        payload["results"] = per_seed
        return payload
|
|
209
|
+
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""AST-based discovery mechanism for baseline files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import importlib.util
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from synth_ai.baseline.config import BaselineConfig
|
|
12
|
+
|
|
13
|
+
# Glob patterns (applied under each search root) that identify baseline files.
BASELINE_FILE_PATTERNS = [
    "**/baseline/*.py",  # any .py file inside a "baseline" directory
    "**/baselines/*.py",  # any .py file inside a "baselines" directory
    "**/*_baseline.py",  # any file whose name ends with "_baseline.py"
]

# Path components that cause a file to be skipped during discovery.
IGNORE_PATTERNS = {
    # version control / caches
    ".git",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    # virtual environments and third-party packages
    ".venv",
    "venv",
    "node_modules",
    # build artifacts
    "build",
    "dist",
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class BaselineChoice:
    """One baseline configuration located in a source file.

    Attributes:
        baseline_id: Identifier of the baseline.
        path: File in which the baseline was found.
        lineno: Line number associated with the baseline in that file.
        source: Origin label, "discovered" or "registered".
        config: The loaded configuration, or None when not loaded.
    """

    baseline_id: str
    path: Path
    lineno: int
    source: str
    config: Optional[BaselineConfig] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BaselineConfigVisitor(ast.NodeVisitor):
    """AST visitor that records assignments of ``BaselineConfig(...)`` calls.

    After ``visit(tree)``, ``matches`` holds one ``(baseline_id, lineno)``
    tuple per assignment whose right-hand side is a call to a callable named
    ``BaselineConfig`` with a literal string ``baseline_id``. Recognised forms:

    - plain and annotated assignments (``x = ...`` and ``x: T = ...``);
    - bare and attribute-qualified callees (``BaselineConfig(...)`` and
      ``module.BaselineConfig(...)``);
    - ``baseline_id`` given by keyword or as the first positional argument.

    Calls whose ``baseline_id`` is not a non-empty string literal are skipped,
    because the identifier cannot be determined without executing the file.
    """

    def __init__(self):
        # (baseline_id, lineno) pairs in visit order.
        self.matches: List[Tuple[str, int]] = []

    def visit_Assign(self, node: ast.Assign) -> None:
        """Inspect a plain assignment, then keep walking its children."""
        self._record_if_baseline_config(node.value, node.lineno)
        self.generic_visit(node)

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        """Inspect an annotated assignment, then keep walking its children."""
        # node.value is None for a bare annotation such as ``x: int``.
        self._record_if_baseline_config(node.value, node.lineno)
        self.generic_visit(node)

    def _record_if_baseline_config(self, value: Optional[ast.AST], lineno: int) -> None:
        """Append a match when ``value`` is a ``BaselineConfig(...)`` call."""
        if not isinstance(value, ast.Call):
            return

        func = value.func
        if isinstance(func, ast.Name):
            callee = func.id
        elif isinstance(func, ast.Attribute):
            # e.g. ``config.BaselineConfig(...)``: compare the final attribute.
            callee = func.attr
        else:
            return
        if callee != "BaselineConfig":
            return

        baseline_id = self._extract_baseline_id(value)
        if baseline_id:
            self.matches.append((baseline_id, lineno))

    def _extract_baseline_id(self, call_node: ast.Call) -> Optional[str]:
        """Return the literal ``baseline_id`` of a constructor call, if any.

        The keyword argument is checked first, then the first positional
        argument (``baseline_id`` is the first field of ``BaselineConfig``).
        Returns None when the value is absent or is not a string literal.
        """
        for keyword in call_node.keywords:
            if keyword.arg == "baseline_id":
                return self._string_literal(keyword.value)
        if call_node.args:
            return self._string_literal(call_node.args[0])
        return None

    @staticmethod
    def _string_literal(node: ast.AST) -> Optional[str]:
        """Return ``node``'s value when it is a string constant, else None."""
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def should_ignore_path(path: Path) -> bool:
    """Return True when any component of ``path`` is listed in IGNORE_PATTERNS."""
    return not IGNORE_PATTERNS.isdisjoint(path.parts)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def discover_baseline_files(search_roots: List[Path]) -> List[BaselineChoice]:
    """Discover baseline files via AST scanning.

    Each existing root is globbed with every pattern in
    BASELINE_FILE_PATTERNS. Matching files are parsed (never executed) and
    scanned with BaselineConfigVisitor for ``BaselineConfig(...)`` assignments.

    Args:
        search_roots: List of root directories to search in

    Returns:
        List of BaselineChoice objects representing discovered baselines,
        with at most one entry per (baseline_id, resolved file path). Files
        that cannot be read, decoded or parsed are skipped silently.
    """
    results: List[BaselineChoice] = []
    seen = set()  # (baseline_id, resolved path) pairs already reported
    scanned = set()  # resolved paths already parsed

    for root in search_roots:
        if not root.exists():
            continue

        for pattern in BASELINE_FILE_PATTERNS:
            # sorted() keeps the output order independent of filesystem order.
            for path in sorted(root.glob(pattern)):
                # Apply the ignore list only to the part below the search
                # root: a root that itself lives under e.g. "build/" or
                # "venv/" must not have all of its files ignored.
                try:
                    relative = path.relative_to(root)
                except ValueError:
                    relative = path
                if should_ignore_path(relative):
                    continue

                # A file can match several patterns (or sit under several
                # roots); parsing it once is enough.
                resolved = path.resolve()
                if resolved in scanned:
                    continue
                scanned.add(resolved)

                try:
                    source = path.read_text(encoding="utf-8")
                    tree = ast.parse(source, filename=str(path))
                except (OSError, SyntaxError, ValueError):
                    # ValueError covers UnicodeDecodeError from non-UTF-8
                    # files and null bytes rejected by ast.parse.
                    continue

                visitor = BaselineConfigVisitor()
                visitor.visit(tree)

                for baseline_id, lineno in visitor.matches:
                    key = (baseline_id, resolved)
                    if key in seen:
                        continue
                    seen.add(key)

                    results.append(
                        BaselineChoice(
                            baseline_id=baseline_id,
                            path=resolved,
                            lineno=lineno,
                            source="discovered",
                        )
                    )

    return results
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def load_baseline_config_from_file(
    baseline_id: str,
    path: Path,
) -> BaselineConfig:
    """Execute a baseline file and return the requested BaselineConfig.

    The file is loaded as a module named "baseline_module". Its public
    attributes (names not starting with an underscore) are then searched for
    a BaselineConfig instance whose ``baseline_id`` matches.

    Args:
        baseline_id: The baseline_id to look for
        path: Path to the Python file

    Returns:
        The matching BaselineConfig instance, with ``_source_path`` set to
        ``path``.

    Raises:
        ImportError: If executing the file hits a missing module.
        ValueError: If baseline_id not found or file cannot be loaded
    """
    spec = importlib.util.spec_from_file_location("baseline_module", path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load baseline file: {path}")

    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except ModuleNotFoundError as e:
        # The message looks like "No module named 'x'": take the quoted name.
        message = str(e)
        if "'" in message:
            missing_module = message.split("'")[1]
        else:
            missing_module = message
        raise ImportError(
            f"❌ Missing dependency for baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Missing module: {missing_module}\n"
            f" Fix: pip install {missing_module} (or 'uv add {missing_module}')"
        ) from e
    except SyntaxError as e:
        raise ValueError(
            f"❌ Syntax error in baseline file '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error at line {e.lineno}: {e.msg}\n"
            f" Text: {e.text.strip() if e.text else 'N/A'}\n"
            f" Fix: Check the Python syntax in the baseline file"
        ) from e
    except Exception as e:
        error_type = type(e).__name__
        raise ValueError(
            f"❌ Failed to load baseline '{baseline_id}'\n"
            f" File: {path}\n"
            f" Error type: {error_type}\n"
            f" Message: {str(e)}\n"
            f" This may be due to:\n"
            f" - Missing dependencies (check imports)\n"
            f" - Configuration errors in the baseline file\n"
            f" - Environment variables not set\n"
            f" Tip: Run with --verbose for more details"
        ) from e

    # Collect every public BaselineConfig instance, in dir() order.
    public_values = (
        getattr(module, attr_name)
        for attr_name in dir(module)
        if not attr_name.startswith("_")
    )
    configs = [value for value in public_values if isinstance(value, BaselineConfig)]

    for candidate in configs:
        if candidate.baseline_id == baseline_id:
            # Set source path for reference
            candidate._source_path = path
            return candidate

    # Nothing matched: report what the file does define, if anything.
    found_configs = [candidate.baseline_id for candidate in configs]
    if not found_configs:
        raise ValueError(
            f"❌ No BaselineConfig instances found in {path}\n"
            f" Expected to find a BaselineConfig with baseline_id='{baseline_id}'\n"
            f" Fix: Ensure the file defines a BaselineConfig instance with baseline_id='{baseline_id}'"
        )
    raise ValueError(
        f"❌ Baseline '{baseline_id}' not found in {path}\n"
        f" Found baselines in this file: {', '.join(found_configs)}\n"
        f" Fix: Use one of the above baseline IDs or check the baseline_id parameter"
    )
|
|
214
|
+
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Execution engine for baseline evaluations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from synth_ai.baseline.config import (
|
|
9
|
+
BaselineConfig,
|
|
10
|
+
BaselineTaskRunner,
|
|
11
|
+
TaskResult,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def default_aggregator(results: List[TaskResult]) -> Dict[str, Any]:
    """Summarise a batch of task results into aggregate reward metrics.

    Reward statistics (mean, population standard deviation, min, max) are
    computed over the successful results only. Task counts and the success
    rate are computed over every result that was passed in.

    Args:
        results: TaskResult objects collected across all seeds.

    Returns:
        Dict of aggregate metrics. When no result succeeded (which includes
        an empty ``results`` list) all reward statistics and the success
        rate are reported as 0.0.
    """
    total = len(results)
    passed = [item for item in results if item.success]
    rewards = [item.outcome_reward for item in passed]
    count = len(rewards)

    # Guard clause: nothing succeeded, so there is nothing to average and
    # dividing by the number of rewards (or results) would be invalid.
    if count == 0:
        return {
            "mean_outcome_reward": 0.0,
            "std_outcome_reward": 0.0,
            "min_outcome_reward": 0.0,
            "max_outcome_reward": 0.0,
            "success_rate": 0.0,
            "total_tasks": total,
            "successful_tasks": 0,
            "failed_tasks": total,
        }

    mean_reward = sum(rewards) / count

    # Population variance (divides by N, not N - 1).
    squared_deviations = ((value - mean_reward) ** 2 for value in rewards)
    variance = sum(squared_deviations) / count

    metrics: Dict[str, Any] = {}
    metrics["mean_outcome_reward"] = mean_reward
    metrics["std_outcome_reward"] = variance ** 0.5
    metrics["min_outcome_reward"] = min(rewards)
    metrics["max_outcome_reward"] = max(rewards)
    metrics["success_rate"] = len(passed) / total
    metrics["total_tasks"] = total
    metrics["successful_tasks"] = len(passed)
    metrics["failed_tasks"] = total - len(passed)
    return metrics
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _is_class_based_runner(task_runner: Any) -> bool:
    """Tell whether ``task_runner`` is a BaselineTaskRunner subclass.

    Returns True only for class objects that subclass BaselineTaskRunner;
    plain functions, instances and any other non-class values yield False.
    """
    if not isinstance(task_runner, type):
        # issubclass() raises TypeError for non-class arguments, so anything
        # that is not a class is rejected before it is reached.
        return False
    return issubclass(task_runner, BaselineTaskRunner)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def run_baseline_evaluation(
    config: BaselineConfig,
    seeds: List[int],
    policy_config: Dict[str, Any],
    env_config: Dict[str, Any],
    concurrency: int = 4,
) -> List[TaskResult]:
    """Run baseline evaluation for the given seeds.

    ``config.task_runner`` may be either a BaselineTaskRunner subclass or an
    async function. A class is instantiated once with
    ``(policy_config, env_config)`` and its ``run_task(seed)`` coroutine is
    awaited per seed; a function is awaited per seed as
    ``task_runner(seed, policy_config, env_config)``.

    Args:
        config: BaselineConfig instance providing ``task_runner``.
        seeds: List of seeds to evaluate.
        policy_config: Policy configuration (merged from defaults + overrides).
        env_config: Environment configuration (merged from defaults + overrides).
        concurrency: Maximum number of tasks executing at once; must be >= 1.

    Returns:
        List of TaskResult objects, one per seed, in the same order as
        ``seeds``. A task that raises is reported as a failed TaskResult
        (``success=False``, ``outcome_reward=0.0``) instead of propagating.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    # A semaphore initialised with 0 never admits a task, so every run_task
    # would block forever; reject it up front. Negative values already raised
    # ValueError from asyncio.Semaphore, so the exception type is unchanged.
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # Instantiate the runner once if it is class-based; all seeds share it.
    runner_instance: Optional[BaselineTaskRunner] = None
    if _is_class_based_runner(config.task_runner):
        runner_instance = config.task_runner(policy_config, env_config)

    # Bounds how many tasks are in flight at the same time.
    semaphore = asyncio.Semaphore(concurrency)

    async def run_task(seed: int) -> TaskResult:
        """Execute a single task, converting any exception into a failed result."""
        async with semaphore:
            try:
                # Identity check rather than truthiness: a runner instance
                # that happens to be falsy must still be used as a runner.
                if runner_instance is not None:
                    return await runner_instance.run_task(seed)
                # Function-based: call the function directly.
                return await config.task_runner(seed, policy_config, env_config)
            except Exception as exc:
                # Some exceptions (e.g. asyncio.TimeoutError()) carry no
                # message; fall back to the class name so the error field
                # is never an empty string.
                return TaskResult(
                    seed=seed,
                    success=False,
                    outcome_reward=0.0,
                    error=str(exc) or type(exc).__name__,
                )

    # Execute all tasks concurrently; gather keeps the order of ``seeds``.
    results = await asyncio.gather(*(run_task(seed) for seed in seeds))
    return list(results)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def aggregate_results(
    config: BaselineConfig,
    results: List[TaskResult],
) -> Dict[str, Any]:
    """Reduce task results to aggregate metrics via the configured aggregator.

    Dispatch depends on ``config.result_aggregator``:

    * ``None``: the module's ``default_aggregator`` is used.
    * a class: it is instantiated with no arguments and its
      ``aggregate(results)`` method is called.
    * anything else: it is called directly as ``aggregator(results)``.

    Args:
        config: BaselineConfig instance.
        results: List of TaskResult objects.

    Returns:
        Dict with aggregate metrics, as produced by the selected aggregator.
    """
    aggregator = config.result_aggregator

    if aggregator is None:
        return default_aggregator(results)

    if not isinstance(aggregator, type):
        # Function-based aggregator: invoke it as-is.
        return aggregator(results)

    # Class-based aggregator: build an instance, then delegate to aggregate().
    return aggregator().aggregate(results)
|
|
146
|
+
|
synth_ai/cli/__init__.py
CHANGED
|
@@ -52,9 +52,77 @@ if not _cli_module:
|
|
|
52
52
|
raise ImportError("synth_ai.cli.root is required for CLI entrypoint")
|
|
53
53
|
cli = _cli_module.cli # type: ignore[attr-defined]
|
|
54
54
|
|
|
55
|
+
# Register core commands implemented as standalone modules.
#
# Each command is imported and attached to the root `cli` group inside its own
# try/except, so a failure to import or register one command does not stop the
# remaining commands from being registered. On failure a "[DEBUG]" line is
# written to stderr and the traceback is printed via traceback.print_exc().
# `sys` and `traceback` are imported inside the handlers, i.e. only when a
# registration actually fails.
try:
    from synth_ai.cli.demo import demo_cmd
    cli.add_command(demo_cmd, name="demo")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register demo command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.setup import setup_cmd
    cli.add_command(setup_cmd, name="setup")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register setup command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.deploy import deploy_cmd  # type: ignore[attr-defined]
    cli.add_command(deploy_cmd, name="deploy")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register deploy command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.opencode import opencode_cmd
    cli.add_command(opencode_cmd, name="opencode")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register opencode command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.codex import codex_cmd
    cli.add_command(codex_cmd, name="codex")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register codex command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    # The eval module exposes its command object as `command`; it is aliased
    # here and registered under the name "eval".
    from synth_ai.cli.eval import command as eval_cmd
    cli.add_command(eval_cmd, name="eval")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register eval command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.claude import claude_cmd
    cli.add_command(claude_cmd, name="claude")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register claude command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
try:
    from synth_ai.cli.commands.baseline import command as baseline_cmd
    from synth_ai.cli.commands.baseline.list import list_command as baseline_list_cmd
    # Register the baseline command on the root CLI, then attach `list` to it
    # as a subcommand (i.e. `baseline list`).
    cli.add_command(baseline_cmd, name="baseline")
    baseline_cmd.add_command(baseline_list_cmd, name="list")
except Exception as e:
    import sys
    print(f"[DEBUG] Failed to register baseline command: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc()
|
|
122
|
+
|
|
55
123
|
|
|
56
124
|
# Register optional subcommands packaged under synth_ai.cli.*
|
|
57
|
-
for _module_path in ("synth_ai.cli.demo", "synth_ai.cli.turso"):
|
|
125
|
+
for _module_path in ("synth_ai.cli.commands.demo", "synth_ai.cli.commands.status", "synth_ai.cli.turso"):
|
|
58
126
|
module = _maybe_import(_module_path)
|
|
59
127
|
if not module:
|
|
60
128
|
continue
|
|
@@ -64,27 +132,35 @@ for _module_path in ("synth_ai.cli.demo", "synth_ai.cli.turso"):
|
|
|
64
132
|
if fn:
|
|
65
133
|
fn(cli)
|
|
66
134
|
|
|
135
|
+
# Smoke command registration (CLI-only helper).
# Best-effort: the smoke module's `register` function is imported and called
# with the root `cli` object; any exception raised while importing or
# registering is silently discarded, so the CLI still loads without it.
# NOTE(review): unlike the core command registrations in this file, nothing is
# logged here on failure — confirm that the silence is intentional.
try:
    from synth_ai.cli.commands.smoke import register as register_smoke

    register_smoke(cli)
except Exception:
    pass
|
|
142
|
+
|
|
143
|
+
# Register help command
|
|
144
|
+
_maybe_call("synth_ai.cli.commands.help.core", "register", cli)
|
|
145
|
+
|
|
67
146
|
# Train CLI lives under synth_ai.api.train
|
|
68
147
|
_maybe_call("synth_ai.api.train", "register", cli)
|
|
69
148
|
|
|
70
149
|
# Task app group/commands are optional and have richer API surface
|
|
71
150
|
_task_apps_module = _maybe_import("synth_ai.cli.task_apps")
|
|
72
|
-
if _task_apps_module:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
# Register TUI command if dependencies allow
|
|
88
|
-
_maybe_call("synth_ai.cli.tui", "register", cli)
|
|
151
|
+
#if _task_apps_module:
|
|
152
|
+
task_app_group = getattr(_task_apps_module, "task_app_group", None)
|
|
153
|
+
if task_app_group is not None:
|
|
154
|
+
cli.add_command(task_app_group, name="task-app")
|
|
155
|
+
# Expose common aliases when present
|
|
156
|
+
commands = getattr(task_app_group, "commands", None)
|
|
157
|
+
if isinstance(commands, dict):
|
|
158
|
+
for alias, name in (("serve", "serve"), ("deploy", "deploy"), ("modal-serve", "modal-serve")):
|
|
159
|
+
command = commands.get(name)
|
|
160
|
+
if command is not None:
|
|
161
|
+
cli.add_command(command, name=alias)
|
|
162
|
+
register_task_apps = _callable_from(_task_apps_module, "register")
|
|
163
|
+
if register_task_apps:
|
|
164
|
+
register_task_apps(cli)
|
|
89
165
|
|
|
90
166
|
# Top-level 'info' alias removed; use `synth-ai task-app info` instead
|
synth_ai/cli/__main__.py
ADDED
|
File without changes
|