synth-ai 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: automated checks flagged this version of synth-ai. Review the changes below for details.

Files changed (192)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/blog_posts/pokemon_vl/README.md +98 -0
  3. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +25 -0
  4. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  5. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  6. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +42 -0
  7. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  8. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  9. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  10. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  11. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  12. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  13. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +41 -0
  14. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  15. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  16. examples/multi_step/configs/crafter_rl_outcome.toml +1 -1
  17. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  18. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -1
  19. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -1
  20. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  21. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  22. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  23. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  24. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  25. examples/qwen_vl/README.md +10 -12
  26. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  27. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  28. examples/qwen_vl/collect_data_via_cli.md +76 -84
  29. examples/qwen_vl/collect_vision_traces.py +4 -4
  30. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  31. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  32. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  33. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  34. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  35. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  36. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  37. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  38. examples/qwen_vl/run_vision_comparison.sh +6 -7
  39. examples/rl/README.md +5 -5
  40. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  41. examples/rl/configs/rl_from_base_qwen17.toml +5 -2
  42. examples/rl/task_app/README.md +1 -2
  43. examples/rl/task_app/math_single_step.py +2 -2
  44. examples/run_crafter_demo.sh +2 -2
  45. examples/sft/README.md +1 -1
  46. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  47. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  48. examples/swe/task_app/README.md +32 -2
  49. examples/swe/task_app/grpo_swe_mini.py +4 -0
  50. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  51. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  52. examples/swe/task_app/hosted/inference/openai_client.py +4 -4
  53. examples/swe/task_app/morph_backend.py +178 -0
  54. examples/task_apps/crafter/task_app/README.md +1 -1
  55. examples/task_apps/crafter/task_app/grpo_crafter.py +66 -3
  56. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  57. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  58. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  59. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +17 -49
  60. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +13 -5
  61. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +15 -1
  62. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  63. examples/task_apps/math/README.md +1 -2
  64. examples/task_apps/pokemon_red/README.md +3 -4
  65. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  66. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  67. examples/task_apps/pokemon_red/task_app.py +36 -5
  68. examples/task_apps/sokoban/README.md +2 -3
  69. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  70. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  71. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  72. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  73. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  74. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -2
  75. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  76. examples/warming_up_to_rl/task_app/README.md +1 -1
  77. examples/warming_up_to_rl/task_app/grpo_crafter.py +134 -3
  78. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +4 -4
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +6 -3
  83. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +5 -0
  85. synth_ai/api/train/builders.py +9 -3
  86. synth_ai/api/train/cli.py +125 -10
  87. synth_ai/api/train/configs/__init__.py +8 -1
  88. synth_ai/api/train/configs/rl.py +32 -7
  89. synth_ai/api/train/configs/sft.py +6 -2
  90. synth_ai/api/train/configs/shared.py +59 -2
  91. synth_ai/auth/credentials.py +119 -0
  92. synth_ai/cli/__init__.py +12 -4
  93. synth_ai/cli/commands/__init__.py +17 -0
  94. synth_ai/cli/commands/demo/__init__.py +6 -0
  95. synth_ai/cli/commands/demo/core.py +163 -0
  96. synth_ai/cli/commands/deploy/__init__.py +23 -0
  97. synth_ai/cli/commands/deploy/core.py +614 -0
  98. synth_ai/cli/commands/deploy/errors.py +72 -0
  99. synth_ai/cli/commands/deploy/validation.py +11 -0
  100. synth_ai/cli/commands/eval/__init__.py +19 -0
  101. synth_ai/cli/commands/eval/core.py +1109 -0
  102. synth_ai/cli/commands/eval/errors.py +81 -0
  103. synth_ai/cli/commands/eval/validation.py +133 -0
  104. synth_ai/cli/commands/filter/__init__.py +12 -0
  105. synth_ai/cli/commands/filter/core.py +388 -0
  106. synth_ai/cli/commands/filter/errors.py +55 -0
  107. synth_ai/cli/commands/filter/validation.py +77 -0
  108. synth_ai/cli/commands/help/__init__.py +177 -0
  109. synth_ai/cli/commands/help/core.py +73 -0
  110. synth_ai/cli/commands/status/__init__.py +64 -0
  111. synth_ai/cli/commands/status/client.py +192 -0
  112. synth_ai/cli/commands/status/config.py +92 -0
  113. synth_ai/cli/commands/status/errors.py +20 -0
  114. synth_ai/cli/commands/status/formatters.py +164 -0
  115. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  116. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  117. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  118. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  119. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  120. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  121. synth_ai/cli/commands/status/utils.py +114 -0
  122. synth_ai/cli/commands/train/__init__.py +53 -0
  123. synth_ai/cli/commands/train/core.py +21 -0
  124. synth_ai/cli/commands/train/errors.py +117 -0
  125. synth_ai/cli/commands/train/judge_schemas.py +199 -0
  126. synth_ai/cli/commands/train/judge_validation.py +304 -0
  127. synth_ai/cli/commands/train/validation.py +443 -0
  128. synth_ai/cli/demo.py +2 -162
  129. synth_ai/cli/deploy/__init__.py +28 -0
  130. synth_ai/cli/deploy/core.py +5 -0
  131. synth_ai/cli/deploy/errors.py +23 -0
  132. synth_ai/cli/deploy/validation.py +5 -0
  133. synth_ai/cli/eval/__init__.py +36 -0
  134. synth_ai/cli/eval/core.py +5 -0
  135. synth_ai/cli/eval/errors.py +31 -0
  136. synth_ai/cli/eval/validation.py +5 -0
  137. synth_ai/cli/filter/__init__.py +28 -0
  138. synth_ai/cli/filter/core.py +5 -0
  139. synth_ai/cli/filter/errors.py +23 -0
  140. synth_ai/cli/filter/validation.py +5 -0
  141. synth_ai/cli/modal_serve/__init__.py +12 -0
  142. synth_ai/cli/modal_serve/core.py +14 -0
  143. synth_ai/cli/modal_serve/errors.py +8 -0
  144. synth_ai/cli/modal_serve/validation.py +11 -0
  145. synth_ai/cli/serve/__init__.py +12 -0
  146. synth_ai/cli/serve/core.py +14 -0
  147. synth_ai/cli/serve/errors.py +8 -0
  148. synth_ai/cli/serve/validation.py +11 -0
  149. synth_ai/cli/setup.py +20 -265
  150. synth_ai/cli/status.py +7 -126
  151. synth_ai/cli/task_app_deploy.py +1 -10
  152. synth_ai/cli/task_app_modal_serve.py +4 -9
  153. synth_ai/cli/task_app_serve.py +4 -11
  154. synth_ai/cli/task_apps.py +58 -1487
  155. synth_ai/cli/train/__init__.py +12 -0
  156. synth_ai/cli/train/core.py +21 -0
  157. synth_ai/cli/train/errors.py +8 -0
  158. synth_ai/cli/train/validation.py +24 -0
  159. synth_ai/cli/train.py +1 -14
  160. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  161. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  162. synth_ai/environments/examples/red/engine.py +33 -12
  163. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  164. synth_ai/environments/examples/red/environment.py +26 -0
  165. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  166. synth_ai/http.py +12 -0
  167. synth_ai/judge_schemas.py +10 -11
  168. synth_ai/learning/rl/client.py +3 -1
  169. synth_ai/streaming/__init__.py +29 -0
  170. synth_ai/streaming/config.py +94 -0
  171. synth_ai/streaming/handlers.py +469 -0
  172. synth_ai/streaming/streamer.py +301 -0
  173. synth_ai/streaming/types.py +95 -0
  174. synth_ai/task/validators.py +2 -2
  175. synth_ai/tracing_v3/migration_helper.py +1 -2
  176. synth_ai/utils/env.py +25 -18
  177. synth_ai/utils/http.py +4 -1
  178. synth_ai/utils/modal.py +2 -2
  179. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/METADATA +8 -3
  180. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/RECORD +184 -109
  181. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  182. synth_ai/cli/tui.py +0 -62
  183. synth_ai/tui/__init__.py +0 -5
  184. synth_ai/tui/__main__.py +0 -13
  185. synth_ai/tui/cli/__init__.py +0 -1
  186. synth_ai/tui/cli/query_experiments.py +0 -164
  187. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  188. synth_ai/tui/dashboard.py +0 -911
  189. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/WHEEL +0 -0
  190. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/entry_points.txt +0 -0
  191. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/licenses/LICENSE +0 -0
  192. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from .core import register, train_command
4
+ from .errors import TrainCliError
5
+ from .validation import validate_train_environment
6
+
7
+ __all__ = [
8
+ "register",
9
+ "train_command",
10
+ "TrainCliError",
11
+ "validate_train_environment",
12
+ ]
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ import click
4
+ from synth_ai.api.train.cli import (
5
+ register as _register_with_cli,
6
+ )
7
+ from synth_ai.api.train.cli import (
8
+ train_command as _train_command,
9
+ )
10
+
11
+ __all__ = ["register", "train_command"]
12
+
13
+
14
def register(cli: click.Group) -> None:
    """Attach the train command to the root CLI.

    Thin compatibility shim: delegates to
    ``synth_ai.api.train.cli.register``, which performs the actual
    click command registration on *cli*.
    """
    _register_with_cli(cli)
17
+
18
+
19
def train_command(*args, **kwargs):
    """Entrypoint used by the train CLI command.

    Pass-through wrapper around ``synth_ai.api.train.cli.train_command``:
    forwards all positional and keyword arguments unchanged and returns
    its result.
    """
    return _train_command(*args, **kwargs)
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+
4
class TrainCliError(RuntimeError):
    """Base exception for train CLI failures.

    Derives from ``RuntimeError`` so existing callers that catch
    ``RuntimeError`` continue to work; more specific train CLI errors
    are expected to subclass this type.
    """
6
+
7
+
8
+ __all__ = ["TrainCliError"]
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Dict, Tuple
6
+
7
+ from synth_ai.api.train.env_resolver import KeySpec, resolve_env
8
+
9
+ __all__ = ["validate_train_environment"]
10
+
11
+
12
def validate_train_environment(
    *,
    config_path: Path | None,
    explicit_env_paths: Iterable[str],
    required_keys: list[KeySpec],
) -> Tuple[Path, Dict[str, str]]:
    """Validate and resolve environment secrets used by the train command.

    Delegates entirely to ``resolve_env`` and returns its result as-is:
    the environment file path that was selected and a mapping of resolved
    key values (see ``resolve_env`` for the exact semantics).

    Args:
        config_path: Optional path to the train config used to locate
            candidate ``.env`` files.
        explicit_env_paths: Environment file paths supplied explicitly
            by the caller (e.g. via CLI flags).
        required_keys: Specifications of the secret keys that must be
            present after resolution.
    """
    return resolve_env(
        config_path=config_path,
        explicit_env_paths=explicit_env_paths,
        required_keys=required_keys,
    )
synth_ai/cli/train.py CHANGED
@@ -1,18 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
4
-
5
- from synth_ai.api.train.cli import register as _register
6
- from synth_ai.api.train.cli import train_command as _train_command
3
+ from synth_ai.cli.commands.train.core import register, train_command
7
4
 
8
5
  __all__ = ["register", "train_command"]
9
-
10
-
11
- def register(cli: Any) -> None:
12
- """Compatibility wrapper for the legacy train CLI location."""
13
-
14
- _register(cli)
15
-
16
-
17
- def train_command(*args: Any, **kwargs: Any) -> Any:
18
- return _train_command(*args, **kwargs)
@@ -3,7 +3,7 @@
3
3
  This module now delegates to the TaskAppConfig defined in the local example at
4
4
  `examples/warming_up_to_rl/task_app/grpo_crafter.py`. It is kept for legacy usage
5
5
  (running the file directly or targeting `fastapi_app` from external tooling).
6
- Prefer using `uvx synth-ai serve grpo-crafter` for local development and testing.
6
+ Prefer using `uvx synth-ai deploy --runtime uvicorn grpo-crafter` for local development and testing.
7
7
  """
8
8
 
9
9
  from __future__ import annotations
@@ -3,7 +3,7 @@
3
3
  This module now delegates to the TaskAppConfig defined in the local example at
4
4
  `examples/task_apps/crafter/task_app/grpo_crafter.py`. It is kept for legacy usage
5
5
  (running the file directly or targeting `fastapi_app` from external tooling).
6
- Prefer using `uvx synth-ai serve grpo-crafter` for local development and testing.
6
+ Prefer using `uvx synth-ai deploy --runtime uvicorn grpo-crafter` for local development and testing.
7
7
  """
8
8
 
9
9
  from __future__ import annotations
@@ -14,12 +14,15 @@ from synth_ai.environments.stateful.engine import StatefulEngine, StatefulEngine
14
14
  from synth_ai.environments.tasks.core import TaskInstance
15
15
 
16
16
  from .engine_helpers.reward_components import (
17
- BadgeRewardComponent,
18
- BattleVictoryComponent,
19
- LevelUpComponent,
20
- MapTransitionComponent,
17
+ RouteExplorationReward,
18
+ StrategicTrainingReward,
19
+ BattleProgressionReward,
20
+ GymPreparationReward,
21
+ ItemCollectionReward,
22
+ HealingManagementReward,
23
+ EfficientExplorationReward,
24
+ BadgeVictoryReward,
21
25
  StepPenaltyComponent,
22
- XPGainComponent,
23
26
  )
24
27
  from .engine_helpers.state_extraction import extract_game_state
25
28
 
@@ -268,15 +271,27 @@ class PokemonRedEngine(StatefulEngine, IReproducibleEngine):
268
271
  # For testing purposes, use None emulator
269
272
  self.emulator = None
270
273
 
271
- # Initialize reward stack with dense components
274
+ # Initialize reward stack with comprehensive progress-based components
272
275
  self.reward_stack = RewardStack(
273
276
  components=[
274
- BadgeRewardComponent(),
275
- MapTransitionComponent(),
276
- BattleVictoryComponent(),
277
- LevelUpComponent(),
278
- XPGainComponent(),
279
- StepPenaltyComponent(),
277
+ # Major progress rewards
278
+ BadgeVictoryReward(), # +50.0 for Boulder Badge (main goal)
279
+ RouteExplorationReward(), # +1.0-5.0 for reaching key areas
280
+ GymPreparationReward(), # +3.0 for being gym-ready
281
+
282
+ # Training and battle rewards
283
+ StrategicTrainingReward(), # +0.2-3.0 for level ups and milestones
284
+ BattleProgressionReward(), # +0.1-1.0 for battles
285
+
286
+ # Resource management rewards
287
+ ItemCollectionReward(), # +0.1-0.5 for collecting items
288
+ HealingManagementReward(), # +0.05-0.8 for healing Pokemon
289
+
290
+ # Exploration efficiency
291
+ EfficientExplorationReward(), # +0.02 for discovering new positions
292
+
293
+ # No penalty for unproductive actions
294
+ StepPenaltyComponent(penalty=0.0), # 0.0 per step
280
295
  ]
281
296
  )
282
297
 
@@ -640,6 +655,12 @@ class PokemonRedEngine(StatefulEngine, IReproducibleEngine):
640
655
  "prev_text_box_active": bool(prev_state.get("text_box_active", False)),
641
656
  "prev_enemy_hp_current": int(prev_state.get("enemy_hp_current", 0)),
642
657
  "prev_enemy_hp_percentage": float(prev_state.get("enemy_hp_percentage", 0.0)),
658
+ "prev_player_x": int(prev_state.get("player_x", 0)),
659
+ "prev_player_y": int(prev_state.get("player_y", 0)),
660
+ "prev_party": prev_state.get("party", []),
661
+ "prev_inventory": prev_state.get("inventory", []),
662
+ "prev_party_hp_current": int(prev_state.get("party_hp_current", 0)),
663
+ "prev_party_hp_max": int(prev_state.get("party_hp_max", 0)),
643
664
  },
644
665
  )
645
666
  except Exception as e:
@@ -3,274 +3,246 @@ from typing import Any, Dict, Set
3
3
  from synth_ai.environments.environment.rewards.core import RewardComponent
4
4
 
5
5
 
6
- class BadgeRewardComponent(RewardComponent):
7
- """Reward for earning gym badges"""
6
+ # ===== COMPREHENSIVE POKEMON RED PROGRESS REWARD SYSTEM =====
7
+ # Designed for deterministic rewards that guide toward beating Brock at Pewter Gym
8
8
 
9
- async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
10
- prev_badges = action.get("prev_badges", 0)
11
- current_badges = state["badges"]
12
- new_badges = current_badges & ~prev_badges
13
- badge_count = bin(new_badges).count("1")
14
- return badge_count * 1.0
15
9
 
10
+ class RouteExplorationReward(RewardComponent):
11
+ """High rewards for reaching key areas on the path to Pewter Gym - guides exploration"""
16
12
 
17
- class MapTransitionComponent(RewardComponent):
18
- """Reward for exploring new areas"""
13
+ def __init__(self):
14
+ self.key_areas_reached: Set[int] = set()
19
15
 
20
16
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
21
- prev_map = action.get("prev_map_id", -1)
22
17
  current_map = state["map_id"]
23
- return 0.1 if current_map != prev_map else 0.0
24
-
18
+ prev_map = action.get("prev_map_id", -1)
25
19
 
26
- class BattleVictoryComponent(RewardComponent):
27
- """Reward for winning battles"""
20
+ # Key maps and rewards for progressing toward Pewter Gym
21
+ area_rewards = {
22
+ 0: 0.0, # Pallet Town (starting point)
23
+ 1: 2.0, # Route 1 - First step out of town (+2.0)
24
+ 2: 1.5, # Viridian City - Major hub (+1.5)
25
+ 3: 1.0, # Route 22 - Path to League (+1.0)
26
+ 4: 1.0, # Route 2 - To Viridian Forest (+1.0)
27
+ 5: 2.0, # Viridian Forest - Dense area (+2.0)
28
+ 6: 1.5, # Pewter City - Target city (+1.5)
29
+ 7: 5.0, # Pewter Gym - GOAL AREA (+5.0 for entering gym)
30
+ }
31
+
32
+ if current_map in area_rewards and current_map not in self.key_areas_reached:
33
+ if prev_map != current_map: # Only reward when actually entering new area
34
+ self.key_areas_reached.add(current_map)
35
+ return area_rewards[current_map]
28
36
 
29
- async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
30
- prev_in_battle = action.get("prev_in_battle", False)
31
- current_in_battle = state["in_battle"]
32
- battle_outcome = state["battle_outcome"]
33
-
34
- # Transitioning from battle to not in battle with victory
35
- if prev_in_battle and not current_in_battle and battle_outcome == 1:
36
- return 0.5
37
37
  return 0.0
38
38
 
39
39
 
40
- class LevelUpComponent(RewardComponent):
41
- """Reward for Pokemon leveling up"""
40
+ class StrategicTrainingReward(RewardComponent):
41
+ """Rewards for building Pokemon strength strategically"""
42
+
43
+ def __init__(self):
44
+ self.level_milestones: Set[int] = set()
45
+ self.last_level = 0
42
46
 
43
47
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
48
+ current_level = state.get("party_level", 0)
44
49
  prev_level = action.get("prev_party_level", 0)
45
- current_level = state["party_level"]
46
- level_gain = max(0, current_level - prev_level)
47
- return level_gain * 0.3
48
50
 
51
+ # Reward reaching key level milestones
52
+ milestone_rewards = {
53
+ 8: 1.0, # Level 8 - Good for early battles
54
+ 12: 2.0, # Level 12 - Ready for Brock
55
+ 15: 3.0, # Level 15 - Strong Pokemon
56
+ }
49
57
 
50
- class XPGainComponent(RewardComponent):
51
- """Small reward for XP gains"""
58
+ if current_level > prev_level and current_level in milestone_rewards:
59
+ if current_level not in self.level_milestones:
60
+ self.level_milestones.add(current_level)
61
+ return milestone_rewards[current_level]
52
62
 
53
- async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
54
- prev_xp = action.get("prev_party_xp", 0)
55
- current_xp = state["party_xp"]
56
- xp_gain = max(0, current_xp - prev_xp)
57
- return xp_gain * 0.001 # Very small multiplier
63
+ # Small reward for any level up (0.2 points)
64
+ if current_level > prev_level:
65
+ return 0.2
58
66
 
67
+ return 0.0
59
68
 
60
- class StepPenaltyComponent(RewardComponent):
61
- """Small penalty for each step to encourage efficiency"""
62
69
 
63
- def __init__(self, penalty: float = -0.001):
64
- self.penalty = penalty
70
+ class BattleProgressionReward(RewardComponent):
71
+ """Rewards for winning battles and gaining experience"""
65
72
 
66
73
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
67
- return self.penalty
74
+ prev_in_battle = action.get("prev_in_battle", False)
75
+ current_in_battle = state.get("in_battle", False)
76
+ battle_outcome = state.get("battle_outcome", 0)
68
77
 
78
+ # Large reward for battle victory (+1.0)
79
+ if prev_in_battle and not current_in_battle and battle_outcome == 1:
80
+ return 1.0
69
81
 
70
- class MenuPenaltyComponent(RewardComponent):
71
- """Penalty for excessive menu usage"""
82
+ # Small reward for entering battle (+0.1) - shows engagement
83
+ if not prev_in_battle and current_in_battle:
84
+ return 0.1
72
85
 
73
- async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
74
- # This would need more sophisticated menu tracking
75
86
  return 0.0
76
87
 
77
88
 
78
- # ===== NEW EARLY GAME PALLET TOWN REWARDS =====
79
-
80
-
81
- class ExitHouseReward(RewardComponent):
82
- """High reward for first time leaving the starting house - +2.0 points"""
89
+ class GymPreparationReward(RewardComponent):
90
+ """Rewards for preparing to challenge Brock"""
83
91
 
84
92
  def __init__(self):
85
- self.house_exited = False
93
+ self.prepared_for_gym = False
86
94
 
87
95
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
88
- if self.house_exited:
96
+ if self.prepared_for_gym:
89
97
  return 0.0
90
98
 
91
- prev_map = action.get("prev_map_id", -1)
92
- current_map = state["map_id"]
99
+ # Check if in Pewter City area and have decent Pokemon
100
+ if state["map_id"] in [6, 7]: # Pewter City or Gym
101
+ party_level = state.get("party_level", 0)
102
+ party_count = len(state.get("party", []))
103
+
104
+ # Reward being prepared for gym battle
105
+ if party_level >= 10 and party_count >= 1:
106
+ self.prepared_for_gym = True
107
+ return 3.0 # Significant reward for being gym-ready
93
108
 
94
- # Exit from house to town (assuming house maps are 1,2 and town is 0)
95
- if prev_map in [1, 2] and current_map == 0:
96
- self.house_exited = True
97
- return 2.0
98
109
  return 0.0
99
110
 
100
111
 
101
- class NPCInteractionReward(RewardComponent):
102
- """Reward for talking to NPCs - +0.8 points per unique NPC"""
112
+ class ItemCollectionReward(RewardComponent):
113
+ """Rewards for collecting useful items"""
103
114
 
104
115
  def __init__(self):
105
- self.npcs_talked_to: Set[tuple] = set()
116
+ self.items_collected: Set[int] = set()
106
117
 
107
118
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
108
- # Detect NPC conversations
109
- if state["text_box_active"] and not action.get("prev_text_box_active", False):
110
- # Use position as NPC identifier
111
- npc_key = (state["player_x"], state["player_y"], state["map_id"])
112
- if npc_key not in self.npcs_talked_to:
113
- self.npcs_talked_to.add(npc_key)
114
- return 0.8
115
- return 0.0
119
+ prev_inventory = action.get("prev_inventory", [])
120
+ current_inventory = state.get("inventory", [])
116
121
 
122
+ # Check for new items
123
+ prev_item_ids = {item["item_id"] for item in prev_inventory}
124
+ current_item_ids = {item["item_id"] for item in current_inventory}
117
125
 
118
- class OakLabDiscoveryReward(RewardComponent):
119
- """High reward for finding and entering Oak's lab - +2.5 points"""
126
+ new_items = current_item_ids - prev_item_ids
120
127
 
121
- def __init__(self):
122
- self.lab_discovered = False
128
+ # Reward valuable items for gym preparation
129
+ valuable_items = {1, 2, 3, 4, 5, 10, 11, 12, 13} # Potions, Balls, etc.
130
+ reward = 0.0
131
+
132
+ for item_id in new_items:
133
+ if item_id not in self.items_collected:
134
+ self.items_collected.add(item_id)
135
+ if item_id in valuable_items:
136
+ reward += 0.5 # +0.5 per valuable item
137
+ else:
138
+ reward += 0.1 # +0.1 per other item
139
+
140
+ return reward
141
+
142
+
143
+ class HealingManagementReward(RewardComponent):
144
+ """Rewards for keeping Pokemon healthy"""
123
145
 
124
146
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
125
- if self.lab_discovered:
147
+ prev_party = action.get("prev_party", [])
148
+ current_party = state.get("party", [])
149
+
150
+ if not prev_party or not current_party:
126
151
  return 0.0
127
152
 
128
- prev_map = action.get("prev_map_id", -1)
129
- current_map = state["map_id"]
153
+ # Reward healing Pokemon back to full health
154
+ prev_hp_pct = sum(p.get("hp_percentage", 0) for p in prev_party) / len(prev_party)
155
+ current_hp_pct = sum(p.get("hp_percentage", 0) for p in current_party) / len(current_party)
156
+
157
+ # Significant improvement in health
158
+ if current_hp_pct > prev_hp_pct + 20: # Healed at least 20% overall
159
+ return 0.8
160
+
161
+ # Small reward for maintaining good health
162
+ if current_hp_pct >= 80 and prev_hp_pct >= 80:
163
+ return 0.05
130
164
 
131
- # Entering Oak's lab (assuming map 3)
132
- if prev_map == 0 and current_map == 3:
133
- self.lab_discovered = True
134
- return 2.5
135
165
  return 0.0
136
166
 
137
167
 
138
- class StarterPokemonReward(RewardComponent):
139
- """Very high reward for getting first Pokemon - +10.0 points"""
168
+ class EfficientExplorationReward(RewardComponent):
169
+ """Rewards for exploring efficiently without getting lost"""
140
170
 
141
171
  def __init__(self):
142
- self.starter_obtained = False
172
+ self.positions_visited: Set[tuple] = set()
143
173
 
144
174
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
145
- if self.starter_obtained:
146
- return 0.0
175
+ # Track unique positions visited in each map
176
+ position_key = (state["map_id"], state["player_x"], state["player_y"])
147
177
 
148
- # Detect getting first Pokemon
149
- prev_party_count = len(action.get("prev_party", []))
150
- current_party_count = len(state.get("party", []))
178
+ if position_key not in self.positions_visited:
179
+ self.positions_visited.add(position_key)
180
+ return 0.02 # Small reward for discovering new areas
151
181
 
152
- if prev_party_count == 0 and current_party_count == 1:
153
- if state["map_id"] == 3: # In Oak's lab
154
- self.starter_obtained = True
155
- return 10.0
156
182
  return 0.0
157
183
 
158
184
 
159
- class FirstBattleReward(RewardComponent):
160
- """High reward for engaging in first battle - +5.0 points"""
161
-
162
- def __init__(self):
163
- self.first_battle = False
185
+ class BadgeVictoryReward(RewardComponent):
186
+ """HUGE reward for achieving the main goal - Boulder Badge"""
164
187
 
165
188
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
166
- if self.first_battle:
167
- return 0.0
189
+ prev_badges = action.get("prev_badges", 0)
190
+ current_badges = state.get("badges", 0)
168
191
 
169
- prev_in_battle = action.get("prev_in_battle", False)
170
- current_in_battle = state["in_battle"]
192
+ # Check if Boulder Badge (bit 0) was newly earned
193
+ boulder_badge_mask = 0x01
194
+ prev_has_badge = prev_badges & boulder_badge_mask
195
+ current_has_badge = current_badges & boulder_badge_mask
196
+
197
+ if not prev_has_badge and current_has_badge:
198
+ return 50.0 # MASSIVE reward for completing the main objective
171
199
 
172
- if not prev_in_battle and current_in_battle:
173
- self.first_battle = True
174
- return 5.0
175
200
  return 0.0
176
201
 
177
202
 
178
- class DirectionExplorationReward(RewardComponent):
179
- """Reward for trying all movement directions - +1.0 points when complete"""
203
+ class StepPenaltyComponent(RewardComponent):
204
+ """Small penalty for each step to encourage efficiency"""
180
205
 
181
- def __init__(self):
182
- self.directions_tried: Set[str] = set()
183
- self.reward_given = False
206
+ def __init__(self, penalty: float = 0.0): # Changed from -0.005 to 0.0
207
+ self.penalty = penalty
184
208
 
185
209
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
186
- if self.reward_given:
187
- return 0.0
210
+ return self.penalty
188
211
 
189
- # Track movement directions based on position changes
190
- prev_x = action.get("prev_player_x", state["player_x"])
191
- prev_y = action.get("prev_player_y", state["player_y"])
192
- current_x = state["player_x"]
193
- current_y = state["player_y"]
194
-
195
- if current_x > prev_x:
196
- self.directions_tried.add("RIGHT")
197
- elif current_x < prev_x:
198
- self.directions_tried.add("LEFT")
199
- elif current_y > prev_y:
200
- self.directions_tried.add("DOWN")
201
- elif current_y < prev_y:
202
- self.directions_tried.add("UP")
203
-
204
- if len(self.directions_tried) >= 4:
205
- self.reward_given = True
206
- return 1.0
207
- return 0.0
208
212
 
213
+ # ===== LEGACY COMPONENTS (kept for compatibility) =====
209
214
 
210
- class BuildingExplorationReward(RewardComponent):
211
- """Reward for entering different buildings - +0.5 points per building"""
212
215
 
213
- def __init__(self):
214
- self.buildings_entered: Set[int] = set()
216
+ class BadgeRewardComponent(RewardComponent):
217
+ """Legacy badge reward - now handled by BadgeVictoryReward"""
215
218
 
216
219
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
217
- prev_map = action.get("prev_map_id", -1)
218
- current_map = state["map_id"]
220
+ return 0.0 # Handled by BadgeVictoryReward
219
221
 
220
- # Entering a new building from town
221
- if (
222
- prev_map == 0 and current_map > 0 and current_map not in [1, 2]
223
- ): # From town to new building
224
- if current_map not in self.buildings_entered:
225
- self.buildings_entered.add(current_map)
226
- return 0.5
227
- return 0.0
228
222
 
223
+ class MapTransitionComponent(RewardComponent):
224
+ """Legacy map transition - now handled by RouteExplorationReward"""
225
+
226
+ async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
227
+ return 0.0 # Handled by RouteExplorationReward
229
228
 
230
- class ObjectInteractionReward(RewardComponent):
231
- """Reward for pressing A on various objects - +0.3 points per object"""
232
229
 
233
- def __init__(self):
234
- self.objects_interacted: Set[tuple] = set()
230
+ class BattleVictoryComponent(RewardComponent):
231
+ """Legacy battle victory - now handled by BattleProgressionReward"""
235
232
 
236
233
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
237
- # Detect A button interactions that trigger text
238
- if state["text_box_active"] and not action.get("prev_text_box_active", False):
239
- object_key = (state["player_x"], state["player_y"], state["map_id"])
240
- if object_key not in self.objects_interacted:
241
- self.objects_interacted.add(object_key)
242
- return 0.3
243
- return 0.0
244
-
234
+ return 0.0 # Handled by BattleProgressionReward
245
235
 
246
- class TownExplorationReward(RewardComponent):
247
- """Reward for thorough town exploration - +0.1 per new position"""
248
236
 
249
- def __init__(self):
250
- self.positions_visited: Set[tuple] = set()
237
+ class LevelUpComponent(RewardComponent):
238
+ """Legacy level up - now handled by StrategicTrainingReward"""
251
239
 
252
240
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
253
- if state["map_id"] == 0: # In Pallet Town
254
- position_key = (state["player_x"], state["player_y"])
255
- if position_key not in self.positions_visited:
256
- self.positions_visited.add(position_key)
257
- return 0.1
258
- return 0.0
259
-
241
+ return 0.0 # Handled by StrategicTrainingReward
260
242
 
261
- class RouteAttemptReward(RewardComponent):
262
- """Reward for trying to leave town (triggers story) - +3.0 points"""
263
243
 
264
- def __init__(self):
265
- self.route_attempted = False
244
+ class XPGainComponent(RewardComponent):
245
+ """Legacy XP gain - now handled by StrategicTrainingReward"""
266
246
 
267
247
  async def score(self, state: Dict[str, Any], action: Dict[str, Any]) -> float:
268
- if self.route_attempted:
269
- return 0.0
270
-
271
- # Detect reaching the edge of Pallet Town (attempting to go north)
272
- if state["map_id"] == 0: # In Pallet Town
273
- if state["player_y"] <= 1: # At northern edge
274
- self.route_attempted = True
275
- return 3.0
276
- return 0.0
248
+ return 0.0 # Handled by StrategicTrainingReward