cogames-agents 0.0.0.7__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cogames_agents/__init__.py +0 -0
- cogames_agents/evals/__init__.py +5 -0
- cogames_agents/evals/planky_evals.py +415 -0
- cogames_agents/policy/__init__.py +0 -0
- cogames_agents/policy/evolution/__init__.py +0 -0
- cogames_agents/policy/evolution/cogsguard/__init__.py +0 -0
- cogames_agents/policy/evolution/cogsguard/evolution.py +695 -0
- cogames_agents/policy/evolution/cogsguard/evolutionary_coordinator.py +540 -0
- cogames_agents/policy/nim_agents/__init__.py +20 -0
- cogames_agents/policy/nim_agents/agents.py +98 -0
- cogames_agents/policy/nim_agents/bindings/generated/libnim_agents.dylib +0 -0
- cogames_agents/policy/nim_agents/bindings/generated/nim_agents.py +215 -0
- cogames_agents/policy/nim_agents/cogsguard_agents.nim +555 -0
- cogames_agents/policy/nim_agents/cogsguard_align_all_agents.nim +569 -0
- cogames_agents/policy/nim_agents/common.nim +1054 -0
- cogames_agents/policy/nim_agents/install.sh +1 -0
- cogames_agents/policy/nim_agents/ladybug_agent.nim +954 -0
- cogames_agents/policy/nim_agents/nim_agents.nim +68 -0
- cogames_agents/policy/nim_agents/nim_agents.nims +14 -0
- cogames_agents/policy/nim_agents/nimby.lock +3 -0
- cogames_agents/policy/nim_agents/racecar_agents.nim +844 -0
- cogames_agents/policy/nim_agents/random_agents.nim +68 -0
- cogames_agents/policy/nim_agents/test_agents.py +53 -0
- cogames_agents/policy/nim_agents/thinky_agents.nim +677 -0
- cogames_agents/policy/nim_agents/thinky_eval.py +230 -0
- cogames_agents/policy/scripted_agent/README.md +360 -0
- cogames_agents/policy/scripted_agent/__init__.py +0 -0
- cogames_agents/policy/scripted_agent/baseline_agent.py +1031 -0
- cogames_agents/policy/scripted_agent/cogas/__init__.py +5 -0
- cogames_agents/policy/scripted_agent/cogas/context.py +68 -0
- cogames_agents/policy/scripted_agent/cogas/entity_map.py +152 -0
- cogames_agents/policy/scripted_agent/cogas/goal.py +115 -0
- cogames_agents/policy/scripted_agent/cogas/goals/__init__.py +27 -0
- cogames_agents/policy/scripted_agent/cogas/goals/aligner.py +160 -0
- cogames_agents/policy/scripted_agent/cogas/goals/gear.py +197 -0
- cogames_agents/policy/scripted_agent/cogas/goals/miner.py +441 -0
- cogames_agents/policy/scripted_agent/cogas/goals/scout.py +40 -0
- cogames_agents/policy/scripted_agent/cogas/goals/scrambler.py +174 -0
- cogames_agents/policy/scripted_agent/cogas/goals/shared.py +160 -0
- cogames_agents/policy/scripted_agent/cogas/goals/stem.py +60 -0
- cogames_agents/policy/scripted_agent/cogas/goals/survive.py +100 -0
- cogames_agents/policy/scripted_agent/cogas/navigator.py +401 -0
- cogames_agents/policy/scripted_agent/cogas/obs_parser.py +238 -0
- cogames_agents/policy/scripted_agent/cogas/policy.py +525 -0
- cogames_agents/policy/scripted_agent/cogas/trace.py +69 -0
- cogames_agents/policy/scripted_agent/cogsguard/CLAUDE.md +517 -0
- cogames_agents/policy/scripted_agent/cogsguard/README.md +252 -0
- cogames_agents/policy/scripted_agent/cogsguard/__init__.py +74 -0
- cogames_agents/policy/scripted_agent/cogsguard/aligned_junction_held_investigation.md +152 -0
- cogames_agents/policy/scripted_agent/cogsguard/aligner.py +333 -0
- cogames_agents/policy/scripted_agent/cogsguard/behavior_hooks.py +44 -0
- cogames_agents/policy/scripted_agent/cogsguard/control_agent.py +323 -0
- cogames_agents/policy/scripted_agent/cogsguard/debug_agent.py +533 -0
- cogames_agents/policy/scripted_agent/cogsguard/miner.py +589 -0
- cogames_agents/policy/scripted_agent/cogsguard/options.py +67 -0
- cogames_agents/policy/scripted_agent/cogsguard/parity_metrics.py +36 -0
- cogames_agents/policy/scripted_agent/cogsguard/policy.py +1967 -0
- cogames_agents/policy/scripted_agent/cogsguard/prereq_trace.py +33 -0
- cogames_agents/policy/scripted_agent/cogsguard/role_trace.py +50 -0
- cogames_agents/policy/scripted_agent/cogsguard/roles.py +31 -0
- cogames_agents/policy/scripted_agent/cogsguard/rollout_trace.py +40 -0
- cogames_agents/policy/scripted_agent/cogsguard/scout.py +69 -0
- cogames_agents/policy/scripted_agent/cogsguard/scrambler.py +350 -0
- cogames_agents/policy/scripted_agent/cogsguard/targeted_agent.py +418 -0
- cogames_agents/policy/scripted_agent/cogsguard/teacher.py +224 -0
- cogames_agents/policy/scripted_agent/cogsguard/types.py +381 -0
- cogames_agents/policy/scripted_agent/cogsguard/v2_agent.py +49 -0
- cogames_agents/policy/scripted_agent/common/__init__.py +0 -0
- cogames_agents/policy/scripted_agent/common/geometry.py +24 -0
- cogames_agents/policy/scripted_agent/common/roles.py +34 -0
- cogames_agents/policy/scripted_agent/common/tag_utils.py +48 -0
- cogames_agents/policy/scripted_agent/demo_policy.py +242 -0
- cogames_agents/policy/scripted_agent/pathfinding.py +126 -0
- cogames_agents/policy/scripted_agent/pinky/DESIGN.md +317 -0
- cogames_agents/policy/scripted_agent/pinky/__init__.py +5 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/__init__.py +17 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/aligner.py +400 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/base.py +119 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/miner.py +632 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/scout.py +138 -0
- cogames_agents/policy/scripted_agent/pinky/behaviors/scrambler.py +433 -0
- cogames_agents/policy/scripted_agent/pinky/policy.py +570 -0
- cogames_agents/policy/scripted_agent/pinky/services/__init__.py +7 -0
- cogames_agents/policy/scripted_agent/pinky/services/map_tracker.py +808 -0
- cogames_agents/policy/scripted_agent/pinky/services/navigator.py +864 -0
- cogames_agents/policy/scripted_agent/pinky/services/safety.py +189 -0
- cogames_agents/policy/scripted_agent/pinky/state.py +299 -0
- cogames_agents/policy/scripted_agent/pinky/types.py +138 -0
- cogames_agents/policy/scripted_agent/planky/CLAUDE.md +124 -0
- cogames_agents/policy/scripted_agent/planky/IMPROVEMENTS.md +160 -0
- cogames_agents/policy/scripted_agent/planky/NOTES.md +153 -0
- cogames_agents/policy/scripted_agent/planky/PLAN.md +254 -0
- cogames_agents/policy/scripted_agent/planky/README.md +214 -0
- cogames_agents/policy/scripted_agent/planky/STRATEGY.md +100 -0
- cogames_agents/policy/scripted_agent/planky/__init__.py +5 -0
- cogames_agents/policy/scripted_agent/planky/context.py +68 -0
- cogames_agents/policy/scripted_agent/planky/entity_map.py +152 -0
- cogames_agents/policy/scripted_agent/planky/goal.py +107 -0
- cogames_agents/policy/scripted_agent/planky/goals/__init__.py +27 -0
- cogames_agents/policy/scripted_agent/planky/goals/aligner.py +168 -0
- cogames_agents/policy/scripted_agent/planky/goals/gear.py +179 -0
- cogames_agents/policy/scripted_agent/planky/goals/miner.py +416 -0
- cogames_agents/policy/scripted_agent/planky/goals/scout.py +40 -0
- cogames_agents/policy/scripted_agent/planky/goals/scrambler.py +174 -0
- cogames_agents/policy/scripted_agent/planky/goals/shared.py +160 -0
- cogames_agents/policy/scripted_agent/planky/goals/stem.py +49 -0
- cogames_agents/policy/scripted_agent/planky/goals/survive.py +96 -0
- cogames_agents/policy/scripted_agent/planky/navigator.py +388 -0
- cogames_agents/policy/scripted_agent/planky/obs_parser.py +238 -0
- cogames_agents/policy/scripted_agent/planky/policy.py +485 -0
- cogames_agents/policy/scripted_agent/planky/tests/__init__.py +0 -0
- cogames_agents/policy/scripted_agent/planky/tests/conftest.py +66 -0
- cogames_agents/policy/scripted_agent/planky/tests/helpers.py +152 -0
- cogames_agents/policy/scripted_agent/planky/tests/test_aligner.py +24 -0
- cogames_agents/policy/scripted_agent/planky/tests/test_miner.py +30 -0
- cogames_agents/policy/scripted_agent/planky/tests/test_scout.py +15 -0
- cogames_agents/policy/scripted_agent/planky/tests/test_scrambler.py +29 -0
- cogames_agents/policy/scripted_agent/planky/tests/test_stem.py +36 -0
- cogames_agents/policy/scripted_agent/planky/trace.py +69 -0
- cogames_agents/policy/scripted_agent/types.py +239 -0
- cogames_agents/policy/scripted_agent/unclipping_agent.py +461 -0
- cogames_agents/policy/scripted_agent/utils.py +381 -0
- cogames_agents/policy/scripted_registry.py +80 -0
- cogames_agents/py.typed +0 -0
- cogames_agents-0.0.0.7.dist-info/METADATA +98 -0
- cogames_agents-0.0.0.7.dist-info/RECORD +128 -0
- cogames_agents-0.0.0.7.dist-info/WHEEL +6 -0
- cogames_agents-0.0.0.7.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""CoGsGuard scripted policy with targeted role assignments."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from cogames_agents.policy.scripted_agent.utils import change_vibe_action
|
|
9
|
+
from mettagrid.policy.policy import StatefulAgentPolicy
|
|
10
|
+
from mettagrid.policy.policy_env_interface import PolicyEnvInterface
|
|
11
|
+
from mettagrid.simulator import Action
|
|
12
|
+
|
|
13
|
+
from .aligner import AlignerAgentPolicyImpl
|
|
14
|
+
from .miner import HEALING_AOE_RANGE, MinerAgentPolicyImpl
|
|
15
|
+
from .policy import DEBUG, CogsguardAgentPolicyImpl, CogsguardMultiRoleImpl, CogsguardPolicy
|
|
16
|
+
from .scout import ScoutAgentPolicyImpl
|
|
17
|
+
from .scrambler import ScramblerAgentPolicyImpl
|
|
18
|
+
from .types import CogsguardAgentState, Role, StructureInfo, StructureType
|
|
19
|
+
|
|
20
|
+
PLAN_INTERVAL_STEPS = 25
|
|
21
|
+
PHASE_EXPLORE_END = 80
|
|
22
|
+
PHASE_CONTROL_END = 260
|
|
23
|
+
CHEST_LOW_THRESHOLD = 60
|
|
24
|
+
CONTROL_VIBES = {"scrambler", "aligner"}
|
|
25
|
+
RESOURCE_CYCLE = ["carbon", "oxygen", "germanium", "silicon"]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _default_role_counts(num_agents: int) -> dict[str, int]:
|
|
29
|
+
if num_agents <= 1:
|
|
30
|
+
return {"miner": 1}
|
|
31
|
+
if num_agents == 2:
|
|
32
|
+
return {"scrambler": 1, "miner": 1}
|
|
33
|
+
if num_agents == 3:
|
|
34
|
+
return {"scrambler": 1, "miner": 1, "scout": 1}
|
|
35
|
+
if num_agents <= 7:
|
|
36
|
+
scramblers = 1
|
|
37
|
+
aligners = 1
|
|
38
|
+
scouts = 1
|
|
39
|
+
else:
|
|
40
|
+
scramblers = max(2, num_agents // 6)
|
|
41
|
+
aligners = max(2, num_agents // 6)
|
|
42
|
+
scouts = 1
|
|
43
|
+
miners = max(1, num_agents - scramblers - scouts - aligners)
|
|
44
|
+
return {
|
|
45
|
+
"scrambler": scramblers,
|
|
46
|
+
"aligner": aligners,
|
|
47
|
+
"miner": miners,
|
|
48
|
+
"scout": scouts,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _normalize_counts(num_agents: int, counts: dict[str, int]) -> dict[str, int]:
|
|
53
|
+
normalized = {k: v for k, v in counts.items() if isinstance(v, int)}
|
|
54
|
+
total = sum(normalized.values())
|
|
55
|
+
if total < num_agents:
|
|
56
|
+
normalized["miner"] = normalized.get("miner", 0) + (num_agents - total)
|
|
57
|
+
elif total > num_agents:
|
|
58
|
+
overflow = total - num_agents
|
|
59
|
+
miners = normalized.get("miner", 0)
|
|
60
|
+
normalized["miner"] = max(0, miners - overflow)
|
|
61
|
+
return normalized
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _build_role_plan(num_agents: int, counts: dict[str, int]) -> list[str]:
|
|
65
|
+
ordered: list[str] = []
|
|
66
|
+
for role_name in ["scrambler", "aligner", "miner", "scout"]:
|
|
67
|
+
ordered.extend([role_name] * counts.get(role_name, 0))
|
|
68
|
+
if len(ordered) < num_agents:
|
|
69
|
+
ordered.extend(["miner"] * (num_agents - len(ordered)))
|
|
70
|
+
return ordered[:num_agents]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class TargetedPlannerState:
    """Planning state shared by every agent of the targeted CoGsGuard policy.

    A single instance is created per policy and handed to each agent's
    implementation.  Agents merge their observations in via
    ``update_from_agent``; agent 0 periodically recomputes role counts and
    junction/extractor assignments via ``maybe_plan``.

    NOTE(review): SOURCE formatting was lost; the placement of
    ``extractor_index += 1`` inside the miner branch of ``_assign_targets``
    is the logical reconstruction — confirm against the original file.
    """

    num_agents: int
    # Planned vibe (role name) per agent; index == agent id.
    desired_vibes: list[str] = field(default_factory=list)
    # Step at which the last plan was computed (plans are rate-limited).
    last_plan_step: int = 0
    # High-water marks of junction counts seen by any agent so far.
    known_junctions: int = 0
    aligned_junctions: int = 0
    # High-water mark of the largest chest inventory observed.
    chest_resources: int = 0
    # Junction position -> last observed alignment.
    junction_map: dict[tuple[int, int], Optional[str]] = field(default_factory=dict)
    # Extractor position -> resource type.
    extractor_map: dict[tuple[int, int], Optional[str]] = field(default_factory=dict)
    # Agent id -> junction / extractor position assigned by the planner.
    assigned_junctions: dict[int, tuple[int, int]] = field(default_factory=dict)
    assigned_extractors: dict[int, tuple[int, int]] = field(default_factory=dict)

    def update_from_agent(self, s: CogsguardAgentState) -> None:
        """Merge one agent's current observations into the shared maps."""
        junctions = s.get_structures_by_type(StructureType.CHARGER)
        aligned = [c for c in junctions if c.alignment == "cogs"]
        # Counts only ever ratchet upward: each agent sees a partial map.
        self.known_junctions = max(self.known_junctions, len(junctions))
        self.aligned_junctions = max(self.aligned_junctions, len(aligned))
        for junction in junctions:
            self.junction_map[junction.position] = junction.alignment

        for extractor in s.get_usable_extractors():
            self.extractor_map[extractor.position] = extractor.resource_type

        # Track the fullest chest any agent has seen so far.
        chest_resources = 0
        for struct in s.get_structures_by_type(StructureType.CHEST):
            chest_resources = max(chest_resources, struct.inventory_amount)
        if chest_resources > 0:
            self.chest_resources = max(self.chest_resources, chest_resources)

    def maybe_plan(self, step_count: int) -> None:
        """Recompute the role plan and target assignments, rate-limited to
        once every PLAN_INTERVAL_STEPS steps."""
        if step_count - self.last_plan_step < PLAN_INTERVAL_STEPS:
            return
        self.last_plan_step = step_count

        counts = self._choose_counts(step_count)
        self.desired_vibes = _build_role_plan(self.num_agents, counts)
        self._assign_targets()

        if DEBUG:
            print(
                f"[TARGETED] plan@{step_count}: junctions={self.known_junctions} "
                f"aligned={self.aligned_junctions} chest={self.chest_resources} "
                f"roles={counts}"
            )

    def _choose_counts(self, step_count: int) -> dict[str, int]:
        """Pick role counts for the current phase of the episode."""
        # Early game, or nothing discovered yet: scout-heavy exploration.
        if step_count < PHASE_EXPLORE_END or self.known_junctions == 0:
            scouts = 3 if self.num_agents >= 8 else 2 if self.num_agents >= 5 else 1
            return {
                "scrambler": 0,
                "aligner": 0,
                "scout": scouts,
                "miner": max(1, self.num_agents - scouts),
            }

        # Chest is running low: minimal support crew, maximise miners.
        if 0 < self.chest_resources < CHEST_LOW_THRESHOLD:
            scramblers = 1
            aligners = 1
            scouts = 1
            return {
                "scrambler": scramblers,
                "aligner": aligners,
                "scout": scouts,
                "miner": max(1, self.num_agents - (scramblers + aligners + scouts)),
            }

        # Control phase: fewer than half the known junctions are ours, so
        # weight the team towards scramblers/aligners.
        if step_count < PHASE_CONTROL_END and self.aligned_junctions < max(1, self.known_junctions // 2):
            if self.num_agents >= 8:
                scramblers = 2
                aligners = 3
            elif self.num_agents >= 6:
                scramblers = 1
                aligners = 2
            else:
                scramblers = 1
                aligners = 1
            return {
                "scrambler": scramblers,
                "aligner": aligners,
                "scout": 1,
                "miner": max(1, self.num_agents - (scramblers + aligners + 1)),
            }

        # Steady state: mostly miners plus a small control contingent.
        return {
            "scrambler": 1,
            "aligner": 2 if self.num_agents >= 6 else 1,
            "scout": 1,
            "miner": max(1, self.num_agents - (2 if self.num_agents >= 6 else 1) - 2),
        }

    def _assign_targets(self) -> None:
        """Round-robin unaligned junctions to control agents and extractors
        to miners, following the current desired_vibes plan."""
        # Only junctions we do not own are worth targeting.
        junctions = [pos for pos, alignment in self.junction_map.items() if alignment != "cogs"]
        junctions.sort()
        extractors_by_resource: dict[str, list[tuple[int, int]]] = {res: [] for res in RESOURCE_CYCLE}
        for pos, resource in self.extractor_map.items():
            if resource in extractors_by_resource:
                extractors_by_resource[resource].append(pos)
        for positions in extractors_by_resource.values():
            positions.sort()
        all_extractors = sorted(self.extractor_map.keys())

        self.assigned_junctions.clear()
        self.assigned_extractors.clear()
        if not self.desired_vibes:
            return

        junction_index = 0
        extractor_index = 0
        for agent_id, vibe in enumerate(self.desired_vibes):
            if vibe in CONTROL_VIBES and junctions:
                self.assigned_junctions[agent_id] = junctions[junction_index]
                junction_index = (junction_index + 1) % len(junctions)
            elif vibe == "miner" and self.extractor_map:
                # Miners prefer the resource slot matching their agent id;
                # fall back to any known extractor.
                preferred = RESOURCE_CYCLE[agent_id % len(RESOURCE_CYCLE)]
                preferred_list = extractors_by_resource.get(preferred, [])
                if preferred_list:
                    self.assigned_extractors[agent_id] = preferred_list[extractor_index % len(preferred_list)]
                elif all_extractors:
                    self.assigned_extractors[agent_id] = all_extractors[extractor_index % len(all_extractors)]
                extractor_index += 1
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class TargetedMultiRoleImpl(CogsguardMultiRoleImpl):
    """Multi-role agent implementation driven by the shared targeted planner."""

    def __init__(
        self,
        policy_env_info: PolicyEnvInterface,
        agent_id: int,
        initial_target_vibe: Optional[str],
        shared_state: TargetedPlannerState,
    ):
        super().__init__(policy_env_info, agent_id, initial_target_vibe=initial_target_vibe)
        self._shared_state = shared_state

    def _execute_phase(self, s: CogsguardAgentState) -> Action:
        """Feed observations to the shared planner, then follow its vibe plan."""
        planner = self._shared_state
        planner.update_from_agent(s)
        # Agent 0 is the designated planner for the whole team.
        if s.agent_id == 0:
            planner.maybe_plan(s.step_count)

        if planner.desired_vibes:
            wanted = planner.desired_vibes[s.agent_id]
            if wanted != s.current_vibe:
                return change_vibe_action(wanted, action_names=self._action_names)

        return super()._execute_phase(s)

    def execute_role(self, s: CogsguardAgentState) -> Action:
        """Pursue the planner-assigned junction before falling back to role logic."""
        assignment = self._shared_state.assigned_junctions.get(s.agent_id)
        eligible = (
            assignment is not None
            and s.current_vibe in CONTROL_VIBES
            and s.has_gear()
            and s.heart >= 1
        )
        if eligible:
            struct = s.get_structure_at(assignment)
            if struct and struct.alignment != "cogs":
                manhattan = abs(assignment[0] - s.row) + abs(assignment[1] - s.col)
                if manhattan > 1:
                    return self._move_towards(s, assignment, reach_adjacent=True)
                return self._use_object_at(s, assignment)
        return super().execute_role(s)

    def _get_role_impl(self, role: Role) -> CogsguardAgentPolicyImpl:
        """Lazily build role implementations; miners get the shared planner state."""
        if role in self._role_impls:
            return self._role_impls[role]
        if role == Role.MINER:
            impl = TargetedMinerAgentPolicyImpl(
                self._policy_env_info, self._agent_id, role, self._shared_state
            )
        else:
            impl_class = {
                Role.SCOUT: ScoutAgentPolicyImpl,
                Role.ALIGNER: TargetedAlignerAgentPolicyImpl,
                Role.SCRAMBLER: TargetedScramblerAgentPolicyImpl,
            }[role]
            impl = impl_class(self._policy_env_info, self._agent_id, role)
        self._role_impls[role] = impl
        return impl
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class TargetedScramblerAgentPolicyImpl(ScramblerAgentPolicyImpl):
    """Scrambler that prioritises enemy junctions, then neutral, then the rest."""

    def _find_best_target(self, s: CogsguardAgentState) -> Optional[tuple[int, int]]:
        junctions = s.get_structures_by_type(StructureType.CHARGER)
        # Small maps revisit junctions sooner than large ones.
        cooldown = 20 if len(junctions) <= 4 else 50

        enemy: list[tuple[int, tuple[int, int]]] = []
        neutral: list[tuple[int, tuple[int, int]]] = []
        remaining: list[tuple[int, tuple[int, int]]] = []

        for junction in junctions:
            pos = junction.position
            # Recently worked junctions are on cooldown.
            last_worked = s.worked_junctions.get(pos, 0)
            if last_worked > 0 and s.step_count - last_worked < cooldown:
                continue
            # Already ours: nothing to scramble.
            if junction.alignment == "cogs":
                continue
            dist = abs(pos[0] - s.row) + abs(pos[1] - s.col)
            if junction.alignment == "clips" or junction.clipped:
                enemy.append((dist, pos))
            elif junction.alignment is None or junction.alignment == "neutral":
                neutral.append((dist, pos))
            else:
                remaining.append((dist, pos))

        # Highest-priority non-empty bucket wins; nearest junction within it.
        for bucket in (enemy, neutral, remaining):
            if bucket:
                return min(bucket)[1]

        return super()._find_best_target(s)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class TargetedAlignerAgentPolicyImpl(AlignerAgentPolicyImpl):
    """Aligner that prefers neutral junctions, then clip-held ones, then the rest."""

    def _find_best_target(self, s: CogsguardAgentState) -> Optional[tuple[int, int]]:
        junctions = s.get_structures_by_type(StructureType.CHARGER)
        # Small maps revisit junctions sooner than large ones.
        cooldown = 20 if len(junctions) <= 4 else 50

        neutral: list[tuple[int, tuple[int, int]]] = []
        clip_held: list[tuple[int, tuple[int, int]]] = []
        remaining: list[tuple[int, tuple[int, int]]] = []

        for junction in junctions:
            pos = junction.position
            # Recently worked junctions are on cooldown.
            last_worked = s.worked_junctions.get(pos, 0)
            if last_worked > 0 and s.step_count - last_worked < cooldown:
                continue
            # Our own junctions need no aligning.
            if junction.alignment == "cogs":
                continue
            dist = abs(pos[0] - s.row) + abs(pos[1] - s.col)
            if junction.alignment is None or junction.alignment == "neutral":
                neutral.append((dist, pos))
            elif junction.alignment == "clips" or junction.clipped:
                clip_held.append((dist, pos))
            else:
                remaining.append((dist, pos))

        # Highest-priority non-empty bucket wins; nearest junction within it.
        for bucket in (neutral, clip_held, remaining):
            if bucket:
                return min(bucket)[1]

        return super()._find_best_target(s)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class TargetedMinerAgentPolicyImpl(MinerAgentPolicyImpl):
    """Miner that honours planner-assigned extractors and a per-agent
    preferred resource before falling back to the base miner logic."""

    def __init__(
        self,
        policy_env_info: PolicyEnvInterface,
        agent_id: int,
        role: Role,
        shared_state: TargetedPlannerState,
    ):
        super().__init__(policy_env_info, agent_id, role)
        self._shared_state = shared_state
        # Spread miners across resources by cycling on agent id.
        self._preferred_resource = RESOURCE_CYCLE[agent_id % len(RESOURCE_CYCLE)]

    def _get_safe_extractor(
        self,
        s: CogsguardAgentState,
        preferred_resource: str | None = None,
    ) -> Optional[StructureInfo]:
        """Choose an extractor that can be reached and returned from safely.

        Priority order:
          1. the planner-assigned extractor, if still usable and within the
             safe round-trip distance;
          2. the best usable extractor of this miner's preferred resource
             (most inventory first, then closest to a depot, then closest
             to the agent);
          3. whatever the base miner selects.
        """
        target = self._shared_state.assigned_extractors.get(s.agent_id)
        if target:
            current = s.get_structure_at(target)
            if current and current.is_usable_extractor():
                max_safe_dist = self._get_max_safe_distance(s)
                dist_to_ext = abs(target[0] - s.row) + abs(target[1] - s.col)
                nearest_depot = self._get_nearest_aligned_depot(s)
                if nearest_depot:
                    dist_ext_to_depot = abs(target[0] - nearest_depot[0]) + abs(target[1] - nearest_depot[1])
                    # The healing AOE around the depot shortens the risky
                    # part of the return leg.
                    round_trip = dist_to_ext + max(0, dist_ext_to_depot - HEALING_AOE_RANGE)
                else:
                    # No depot known: assume a full walk back.
                    round_trip = dist_to_ext * 2
                if round_trip <= max_safe_dist:
                    return current
        resource = preferred_resource or self._preferred_resource
        preferred = [ext for ext in s.get_usable_extractors() if ext.resource_type == resource]
        if preferred:
            nearest_depot = self._get_nearest_aligned_depot(s)
            max_safe_dist = self._get_max_safe_distance(s)
            # (inventory, dist-to-depot, dist-to-agent, extractor) tuples.
            candidates: list[tuple[int, int, int, StructureInfo]] = []
            for ext in preferred:
                dist_to_ext = abs(ext.position[0] - s.row) + abs(ext.position[1] - s.col)
                if nearest_depot:
                    dist_ext_to_depot = abs(ext.position[0] - nearest_depot[0]) + abs(
                        ext.position[1] - nearest_depot[1]
                    )
                    round_trip = dist_to_ext + max(0, dist_ext_to_depot - HEALING_AOE_RANGE)
                else:
                    round_trip = dist_to_ext * 2
                if round_trip <= max_safe_dist:
                    # Recomputed here (with a 100 fallback) so the ranking
                    # key is well-defined even without a depot.
                    dist_ext_to_depot = (
                        abs(ext.position[0] - nearest_depot[0]) + abs(ext.position[1] - nearest_depot[1])
                        if nearest_depot
                        else 100
                    )
                    candidates.append((ext.inventory_amount, dist_ext_to_depot, dist_to_ext, ext))
            if candidates:
                # Richest first, then nearest to a depot, then nearest to us.
                candidates.sort(key=lambda x: (-x[0], x[1], x[2]))
                return candidates[0][3]

        return super()._get_safe_extractor(s, preferred_resource=preferred_resource)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class CogsguardTargetedAgent(CogsguardPolicy):
    """CoGsGuard policy with coordinated role and target assignment."""

    short_names = ["cogsguard_targeted"]

    def __init__(
        self,
        policy_env_info: PolicyEnvInterface,
        device: str = "cpu",
        **vibe_counts: Any,
    ):
        num_agents = policy_env_info.num_agents
        # Explicit integer counts from the caller win over the defaults.
        if any(isinstance(v, int) for v in vibe_counts.values()):
            counts = _normalize_counts(num_agents, vibe_counts)
        else:
            counts = _default_role_counts(num_agents)
        super().__init__(policy_env_info, device=device, **counts)
        self._shared_state = TargetedPlannerState(num_agents)
        self._shared_state.desired_vibes = _build_role_plan(num_agents, counts)

    def agent_policy(self, agent_id: int) -> StatefulAgentPolicy[CogsguardAgentState]:
        """Return (and cache) the stateful policy wrapper for *agent_id*."""
        if agent_id in self._agent_policies:
            return self._agent_policies[agent_id]

        initial_vibe = (
            self._initial_vibes[agent_id] if agent_id < len(self._initial_vibes) else None
        )
        impl = TargetedMultiRoleImpl(
            self._policy_env_info,
            agent_id,
            initial_target_vibe=initial_vibe,
            shared_state=self._shared_state,
        )
        wrapped = StatefulAgentPolicy(impl, self._policy_env_info, agent_id=agent_id)
        self._agent_policies[agent_id] = wrapped
        return wrapped
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Sequence
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from cogames_agents.policy.nim_agents.agents import CogsguardAgentsMultiPolicy
|
|
8
|
+
from cogames_agents.policy.scripted_agent.cogsguard.types import Role as CogsguardRole
|
|
9
|
+
from cogames_agents.policy.scripted_agent.common.roles import ROLE_VIBES
|
|
10
|
+
from mettagrid.policy.policy import AgentPolicy, MultiAgentPolicy
|
|
11
|
+
from mettagrid.policy.policy_env_interface import PolicyEnvInterface
|
|
12
|
+
from mettagrid.simulator import Action, AgentObservation
|
|
13
|
+
|
|
14
|
+
DEFAULT_ROLE_VIBES = tuple(ROLE_VIBES)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CogsguardTeacherPolicy(MultiAgentPolicy):
|
|
18
|
+
"""Teacher wrapper that forces an initial vibe, then delegates to the Nim policy."""
|
|
19
|
+
|
|
20
|
+
short_names = ["teacher"]
|
|
21
|
+
|
|
22
|
+
def __init__(
|
|
23
|
+
self,
|
|
24
|
+
policy_env_info: PolicyEnvInterface,
|
|
25
|
+
device: str = "cpu",
|
|
26
|
+
role_vibes: Optional[Sequence[str | CogsguardRole] | str] = None,
|
|
27
|
+
) -> None:
|
|
28
|
+
super().__init__(policy_env_info, device=device)
|
|
29
|
+
self._delegate = CogsguardAgentsMultiPolicy(policy_env_info)
|
|
30
|
+
self._num_agents = policy_env_info.num_agents
|
|
31
|
+
self._action_names = list(policy_env_info.action_names)
|
|
32
|
+
self._action_name_to_index = {name: idx for idx, name in enumerate(self._action_names)}
|
|
33
|
+
self._delegate_agents = [self._delegate.agent_policy(i) for i in range(self._num_agents)]
|
|
34
|
+
|
|
35
|
+
self._episode_feature_id = self._find_feature_id("episode_completion_pct")
|
|
36
|
+
self._last_action_feature_id = self._find_feature_id("last_action")
|
|
37
|
+
|
|
38
|
+
self._role_action_ids = self._resolve_role_actions(role_vibes)
|
|
39
|
+
self._reset_episode_state()
|
|
40
|
+
|
|
41
|
+
    def agent_policy(self, agent_id: int) -> AgentPolicy:
        """Return a per-agent view that routes steps through this teacher."""
        return _CogsguardTeacherAgentPolicy(self, agent_id)
|
|
43
|
+
|
|
44
|
+
    def reset(self) -> None:
        """Reset the wrapped Nim policy, then this wrapper's per-episode state."""
        self._delegate.reset()
        self._reset_episode_state()
|
|
47
|
+
|
|
48
|
+
def step_batch(self, raw_observations: np.ndarray, raw_actions: np.ndarray) -> None:
|
|
49
|
+
self._delegate.step_batch(raw_observations, raw_actions)
|
|
50
|
+
if not self._role_action_ids:
|
|
51
|
+
return
|
|
52
|
+
if raw_observations.shape[0] != self._num_agents:
|
|
53
|
+
return
|
|
54
|
+
for agent_id in range(self._num_agents):
|
|
55
|
+
episode_pct = self._extract_episode_pct_raw(raw_observations[agent_id])
|
|
56
|
+
last_action = self._extract_last_action_raw(raw_observations[agent_id])
|
|
57
|
+
forced_action = self._maybe_force_action(agent_id, episode_pct, last_action)
|
|
58
|
+
if forced_action is not None:
|
|
59
|
+
raw_actions[agent_id] = forced_action
|
|
60
|
+
|
|
61
|
+
def _step_single(self, agent_id: int, obs: AgentObservation) -> Action:
|
|
62
|
+
base_action = self._delegate_agents[agent_id].step(obs)
|
|
63
|
+
if not self._role_action_ids:
|
|
64
|
+
return base_action
|
|
65
|
+
episode_pct = self._extract_episode_pct_obs(obs)
|
|
66
|
+
last_action = self._extract_last_action_obs(obs)
|
|
67
|
+
forced_action = self._maybe_force_action(agent_id, episode_pct, last_action)
|
|
68
|
+
if forced_action is None:
|
|
69
|
+
return base_action
|
|
70
|
+
action_name = self._action_names[forced_action]
|
|
71
|
+
return Action(name=action_name)
|
|
72
|
+
|
|
73
|
+
def _extract_episode_pct_raw(self, raw_obs: np.ndarray) -> Optional[int]:
|
|
74
|
+
if self._episode_feature_id is None:
|
|
75
|
+
return None
|
|
76
|
+
for token in raw_obs:
|
|
77
|
+
if token[0] == 255 and token[1] == 255 and token[2] == 255:
|
|
78
|
+
break
|
|
79
|
+
if token[1] == self._episode_feature_id:
|
|
80
|
+
return int(token[2])
|
|
81
|
+
return 0
|
|
82
|
+
|
|
83
|
+
def _extract_episode_pct_obs(self, obs: AgentObservation) -> Optional[int]:
|
|
84
|
+
if self._episode_feature_id is None:
|
|
85
|
+
return None
|
|
86
|
+
for token in obs.tokens:
|
|
87
|
+
if token.feature.name == "episode_completion_pct":
|
|
88
|
+
return token.value
|
|
89
|
+
return 0
|
|
90
|
+
|
|
91
|
+
def _extract_last_action_raw(self, raw_obs: np.ndarray) -> Optional[int]:
|
|
92
|
+
if self._last_action_feature_id is None:
|
|
93
|
+
return None
|
|
94
|
+
for token in raw_obs:
|
|
95
|
+
if token[0] == 255 and token[1] == 255 and token[2] == 255:
|
|
96
|
+
break
|
|
97
|
+
if token[1] == self._last_action_feature_id:
|
|
98
|
+
return int(token[2])
|
|
99
|
+
return 0
|
|
100
|
+
|
|
101
|
+
def _extract_last_action_obs(self, obs: AgentObservation) -> Optional[int]:
|
|
102
|
+
if self._last_action_feature_id is None:
|
|
103
|
+
return None
|
|
104
|
+
for token in obs.tokens:
|
|
105
|
+
if token.feature.name == "last_action":
|
|
106
|
+
return token.value
|
|
107
|
+
return 0
|
|
108
|
+
|
|
109
|
+
def _find_feature_id(self, feature_name: str) -> Optional[int]:
|
|
110
|
+
for feature in self.policy_env_info.obs_features:
|
|
111
|
+
if feature.name == feature_name:
|
|
112
|
+
return feature.id
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
def _resolve_role_actions(self, role_vibes: Optional[Sequence[str | CogsguardRole] | str]) -> list[int]:
|
|
116
|
+
change_vibe_actions = [name for name in self._action_names if name.startswith("change_vibe_")]
|
|
117
|
+
if len(change_vibe_actions) <= 1:
|
|
118
|
+
return []
|
|
119
|
+
|
|
120
|
+
available_vibes = [name[len("change_vibe_") :] for name in change_vibe_actions]
|
|
121
|
+
if role_vibes is None:
|
|
122
|
+
role_vibes = [vibe for vibe in DEFAULT_ROLE_VIBES if vibe in available_vibes]
|
|
123
|
+
if not role_vibes:
|
|
124
|
+
role_vibes = [vibe for vibe in available_vibes if vibe != "default"]
|
|
125
|
+
if not role_vibes:
|
|
126
|
+
role_vibes = available_vibes
|
|
127
|
+
else:
|
|
128
|
+
if isinstance(role_vibes, str):
|
|
129
|
+
normalized_vibes = [vibe.strip() for vibe in role_vibes.split(",") if vibe.strip()]
|
|
130
|
+
else:
|
|
131
|
+
normalized_vibes = [vibe.value if isinstance(vibe, CogsguardRole) else str(vibe) for vibe in role_vibes]
|
|
132
|
+
role_vibes = [vibe for vibe in normalized_vibes if vibe in available_vibes]
|
|
133
|
+
if not role_vibes:
|
|
134
|
+
role_vibes = available_vibes
|
|
135
|
+
|
|
136
|
+
role_action_ids = []
|
|
137
|
+
for vibe_name in role_vibes:
|
|
138
|
+
action_name = f"change_vibe_{vibe_name}"
|
|
139
|
+
action_id = self._action_name_to_index.get(action_name)
|
|
140
|
+
if action_id is not None:
|
|
141
|
+
role_action_ids.append(action_id)
|
|
142
|
+
return role_action_ids
|
|
143
|
+
|
|
144
|
+
def _reset_episode_state(self) -> None:
|
|
145
|
+
self._episode_index = [0] * self._num_agents
|
|
146
|
+
self._forced_vibe = [False] * self._num_agents
|
|
147
|
+
self._last_episode_pct = [-1] * self._num_agents
|
|
148
|
+
self._step_in_episode = [0] * self._num_agents
|
|
149
|
+
self._last_action_value: list[Optional[int]] = [None] * self._num_agents
|
|
150
|
+
|
|
151
|
+
def _maybe_force_action(
|
|
152
|
+
self,
|
|
153
|
+
agent_id: int,
|
|
154
|
+
episode_pct: Optional[int],
|
|
155
|
+
last_action: Optional[int],
|
|
156
|
+
) -> Optional[int]:
|
|
157
|
+
self._update_episode_state(agent_id, episode_pct, last_action)
|
|
158
|
+
if self._forced_vibe[agent_id] or self._step_in_episode[agent_id] != 0:
|
|
159
|
+
return None
|
|
160
|
+
self._forced_vibe[agent_id] = True
|
|
161
|
+
role_index = (self._episode_index[agent_id] + agent_id) % len(self._role_action_ids)
|
|
162
|
+
return self._role_action_ids[role_index]
|
|
163
|
+
|
|
164
|
+
    def _update_episode_state(
        self,
        agent_id: int,
        episode_pct: Optional[int],
        last_action: Optional[int],
    ) -> None:
        """Advance one agent's episode bookkeeping for the current step.

        Detects episode boundaries either from the episode-completion
        percentage (when available) or, as a fallback when episode_pct is
        None, from the last-action value dropping back to 0 after a non-zero
        action mid-episode. On a new episode the step counter and forced-vibe
        flag are reset and the episode index is advanced.
        """
        last_pct = self._last_episode_pct[agent_id]
        if episode_pct is None:
            # Fallback mode: no completion-pct feature. Infer an episode reset
            # from last_action transitioning non-zero -> 0 after at least one
            # step (presumably the env reports action 0 on reset — TODO confirm).
            last_action_seen = self._last_action_value[agent_id]
            if (
                last_action is not None
                and last_action_seen is not None
                and last_action == 0
                and last_action_seen != 0
                and self._step_in_episode[agent_id] > 0
            ):
                # New episode detected: advance index, clear per-episode state.
                self._episode_index[agent_id] += 1
                self._step_in_episode[agent_id] = 0
                self._forced_vibe[agent_id] = False
                self._last_episode_pct[agent_id] = 0
                self._last_action_value[agent_id] = last_action
                return

            # Same episode (or very first observation, when last_pct == -1).
            if last_pct == -1:
                self._step_in_episode[agent_id] = 0
            else:
                self._step_in_episode[agent_id] += 1
            self._last_episode_pct[agent_id] = 0
            if last_action is not None:
                self._last_action_value[agent_id] = last_action
            return

        # Pct mode: a new episode starts on the first observation ever
        # (last_pct == -1) or when the completion percentage moves backwards.
        new_episode = False
        if last_pct == -1:
            new_episode = True
        elif episode_pct < last_pct:
            new_episode = True
        elif last_pct > 0 and episode_pct == 0:
            # NOTE(review): this branch looks unreachable — episode_pct == 0
            # with last_pct > 0 is already caught by episode_pct < last_pct.
            new_episode = True

        if new_episode:
            # Don't bump the index on the very first observation: that is
            # episode 0, not a transition into a new episode.
            if last_pct != -1:
                self._episode_index[agent_id] += 1
            self._step_in_episode[agent_id] = 0
            self._forced_vibe[agent_id] = False
        else:
            self._step_in_episode[agent_id] += 1

        self._last_episode_pct[agent_id] = episode_pct
        if last_action is not None:
            self._last_action_value[agent_id] = last_action
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class _CogsguardTeacherAgentPolicy(AgentPolicy):
    """Thin per-agent facade that forwards stepping to the shared teacher policy."""

    def __init__(self, parent: CogsguardTeacherPolicy, agent_id: int) -> None:
        """Bind this per-agent view to its coordinating parent policy."""
        super().__init__(parent.policy_env_info)
        self._agent_id = agent_id
        self._parent = parent

    def step(self, obs: AgentObservation) -> Action:
        """Delegate the decision for this agent to the parent policy."""
        # All shared episode/role bookkeeping lives on the parent.
        return self._parent._step_single(self._agent_id, obs)
|