phantomrt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atlas/__init__.py +3 -0
- atlas/agents/__init__.py +8 -0
- atlas/agents/command_space.py +227 -0
- atlas/analysis/__init__.py +3 -0
- atlas/analysis/binary_agent.py +488 -0
- atlas/analysis/binary_fuzz.py +389 -0
- atlas/analysis/frida_live.py +261 -0
- atlas/analysis/graph_annotator.py +147 -0
- atlas/analysis/spectrida_bridge.py +84 -0
- atlas/analysis/unicorn_harness.py +337 -0
- atlas/core/__init__.py +14 -0
- atlas/core/decoder.py +65 -0
- atlas/core/dynamics.py +217 -0
- atlas/core/encoder.py +120 -0
- atlas/core/surprise.py +145 -0
- atlas/core/world_model.py +334 -0
- atlas/environments/__init__.py +5 -0
- atlas/environments/base.py +51 -0
- atlas/environments/grid_world.py +219 -0
- atlas/environments/physics_2d.py +283 -0
- atlas/environments/vm_world.py +168 -0
- atlas/knowledge/__init__.py +3 -0
- atlas/knowledge/instruction_vocab.py +534 -0
- atlas/monitor/__init__.py +5 -0
- atlas/monitor/execution_monitor.py +518 -0
- atlas/optimization/__init__.py +6 -0
- atlas/optimization/speed.py +457 -0
- atlas/planning/__init__.py +4 -0
- atlas/planning/goal.py +100 -0
- atlas/planning/mcts.py +228 -0
- atlas/training/__init__.py +4 -0
- atlas/training/continual.py +392 -0
- atlas/training/growth.py +213 -0
- atlas/training/loop.py +306 -0
- atlas/training/losses.py +101 -0
- atlas/training/self_train.py +307 -0
- atlas/utils/__init__.py +4 -0
- atlas/utils/logging.py +33 -0
- atlas/utils/math_helpers.py +30 -0
- atlas/utils/viz.py +136 -0
- atlas/vm/__init__.py +4 -0
- atlas/vm/wsl_vm.py +249 -0
- phantomrt-0.1.0.dist-info/METADATA +75 -0
- phantomrt-0.1.0.dist-info/RECORD +48 -0
- phantomrt-0.1.0.dist-info/WHEEL +5 -0
- phantomrt-0.1.0.dist-info/entry_points.txt +3 -0
- phantomrt-0.1.0.dist-info/licenses/LICENSE +21 -0
- phantomrt-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
"""
|
|
2
|
+
World Model: The Complete Brain
|
|
3
|
+
|
|
4
|
+
Combines encoder, dynamics, and decoder into one coherent system
|
|
5
|
+
that can:
|
|
6
|
+
1. Encode observations into understanding
|
|
7
|
+
2. Simulate the future in imagination
|
|
8
|
+
3. Predict what it would see
|
|
9
|
+
4. Detect surprises and learn from them
|
|
10
|
+
5. Plan actions by imagining outcomes
|
|
11
|
+
|
|
12
|
+
This is the central class that everything else interacts with.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import torch
|
|
16
|
+
import torch.nn as nn
|
|
17
|
+
import torch.nn.functional as F
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from .encoder import Encoder
|
|
22
|
+
from .decoder import Decoder
|
|
23
|
+
from .dynamics import DynamicsFunction, NeuralODE
|
|
24
|
+
from .surprise import SurpriseDetector
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class WorldModelOutput:
    """Result bundle returned by a single world-model forward pass.

    Attributes:
        latent_state: Latent state obtained from the encoder,
            [batch, latent_dim].
        encoder_mean: Mean produced by the encoder, [batch, latent_dim].
        encoder_log_var: Log-variance produced by the encoder,
            [batch, latent_dim].
        reconstructed_obs: Observation decoded back from ``latent_state``,
            [batch, obs_dim].
        reconstruction_loss: Scalar reconstruction error.
        kl_loss: Scalar KL regularisation term.
        surprise_loss: Scalar surprise value.
        total_loss: Scalar weighted combination used for optimisation.
        is_surprising: Whether the observation was flagged as surprising.
        surprise_score: Plain-float copy of the surprise value.
    """

    # --- current state ---
    latent_state: torch.Tensor
    encoder_mean: torch.Tensor
    encoder_log_var: torch.Tensor

    # --- reconstruction ---
    reconstructed_obs: torch.Tensor

    # --- losses (all scalar tensors) ---
    reconstruction_loss: torch.Tensor
    kl_loss: torch.Tensor
    surprise_loss: torch.Tensor
    total_loss: torch.Tensor

    # --- surprise info ---
    is_surprising: bool
    surprise_score: float
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class RolloutOutput:
    """Result bundle for an imagined (latent-space) trajectory.

    Attributes:
        trajectory: Latent states along the rollout,
            [batch, steps+1, latent_dim].
        predicted_observations: Observations decoded from every latent state,
            [batch, steps+1, obs_dim].
        rewards: Per-state reward predictions, or ``None`` when no reward
            predictor is available. ``WorldModel.imagine`` reshapes these to
            [batch, steps+1, 1], i.e. it keeps a trailing singleton dimension.
    """

    trajectory: torch.Tensor
    predicted_observations: torch.Tensor
    rewards: Optional[torch.Tensor]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class WorldModel(nn.Module):
    """
    Central model that wires encoder, latent dynamics, decoder, surprise
    detector and reward head into one module.

    Data flow:
        observation            -> [Encoder]   -> latent state
        latent state + actions -> [NeuralODE] -> future latent states
        latent state           -> [Decoder]   -> predicted observation
        observation vs. reconstruction -> [SurpriseDetector]

    Main entry points:
        - encode():         observation -> (mean, log_var, sampled state)
        - predict():        latent state -> predicted observation
        - imagine():        roll a latent trajectory forward and decode it
        - step_dynamics():  a single latent transition
        - forward():        encode, reconstruct and assemble the losses
    """

    def __init__(
        self,
        obs_dim: int,
        action_dim: int,
        latent_dim: int = 256,
        hidden_dim: int = 512,
        encoder_hidden_dims: list = None,
        decoder_hidden_dims: list = None,
        dynamics_layers: int = 3,
        dynamics_solver: str = "dopri5",
        dynamics_dt: float = 0.05,
        dropout: float = 0.1,
        surprise_threshold: float = 0.1,
    ):
        """
        Build all sub-components.

        Args:
            obs_dim: size of an observation vector
            action_dim: size of an action vector
            latent_dim: size of the latent state
            hidden_dim: width of the dynamics function and the reward head;
                also the default width of encoder/decoder hidden layers
            encoder_hidden_dims: encoder hidden layer sizes; defaults to
                two layers of ``hidden_dim``
            decoder_hidden_dims: decoder hidden layer sizes; defaults to
                two layers of ``hidden_dim``
            dynamics_layers: ``num_layers`` handed to the dynamics function
            dynamics_solver: ``solver`` handed to the neural ODE
            dynamics_dt: ``dt`` handed to the neural ODE
            dropout: dropout value handed to the encoder
            surprise_threshold: ``initial_threshold`` of the surprise detector
        """
        super().__init__()

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.latent_dim = latent_dim

        # Each side gets its own freshly built default list.
        encoder_layout = (
            [hidden_dim, hidden_dim]
            if encoder_hidden_dims is None
            else encoder_hidden_dims
        )
        decoder_layout = (
            [hidden_dim, hidden_dim]
            if decoder_hidden_dims is None
            else decoder_hidden_dims
        )

        # --- core components (construction order is kept on purpose so
        # --- sub-module registration order stays the same) ---
        self.encoder = Encoder(
            obs_dim=obs_dim,
            latent_dim=latent_dim,
            hidden_dims=encoder_layout,
            dropout=dropout,
        )
        self.dynamics_fn = DynamicsFunction(
            state_dim=latent_dim,
            action_dim=action_dim,
            hidden_dim=hidden_dim,
            num_layers=dynamics_layers,
        )
        self.neural_ode = NeuralODE(
            dynamics_fn=self.dynamics_fn,
            solver=dynamics_solver,
            dt=dynamics_dt,
        )
        self.decoder = Decoder(
            latent_dim=latent_dim,
            obs_dim=obs_dim,
            hidden_dims=decoder_layout,
        )

        # --- surprise system ---
        self.surprise_detector = SurpriseDetector(
            initial_threshold=surprise_threshold,
        )

        # --- reward head used for planning: latent state -> one value ---
        reward_layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.reward_predictor = nn.Sequential(*reward_layers)

    def encode(self, observation: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Map an observation to its latent description.

        Delegates to ``self.encoder.encode``.

        Args:
            observation: [batch, obs_dim]

        Returns:
            (mean, log_var, sampled_state): each [batch, latent_dim]
        """
        encoded = self.encoder.encode(observation)
        return encoded

    def predict(self, latent_state: torch.Tensor) -> torch.Tensor:
        """
        Turn a latent state into the observation the model expects to see.

        Args:
            latent_state: [batch, latent_dim]

        Returns:
            predicted_observation: [batch, obs_dim]
        """
        decoded = self.decoder(latent_state)
        return decoded

    def imagine(
        self,
        initial_observation: torch.Tensor,
        actions: torch.Tensor,
    ) -> RolloutOutput:
        """
        Simulate the consequences of an action sequence purely in latent
        space, without interacting with any environment.

        Args:
            initial_observation: [batch, obs_dim]
            actions: [batch, num_steps, action_dim]

        Returns:
            RolloutOutput holding the latent trajectory, the decoded
            observations and the predicted rewards (reshaped to
            [batch, timesteps, 1]).
        """
        # Only the sampled latent state is needed as the starting point.
        start_state = self.encode(initial_observation)[2]

        # Expected layout: [batch, num_steps + 1, latent_dim].
        rollout = self.neural_ode(start_state, actions)
        n_batch, n_time, _ = rollout.shape

        # Merge batch and time so decoder and reward head see 2-D input.
        states_2d = rollout.reshape(-1, self.latent_dim)

        decoded = self.decoder(states_2d).reshape(n_batch, n_time, self.obs_dim)
        reward_seq = self.reward_predictor(states_2d).reshape(n_batch, n_time, 1)

        return RolloutOutput(
            trajectory=rollout,
            predicted_observations=decoded,
            rewards=reward_seq,
        )

    def step_dynamics(
        self,
        latent_state: torch.Tensor,
        action: torch.Tensor,
    ) -> torch.Tensor:
        """
        Advance the latent state by exactly one dynamics step.

        Args:
            latent_state: [batch, latent_dim]
            action: [batch, action_dim]

        Returns:
            next_state: [batch, latent_dim]
        """
        next_state = self.neural_ode.single_step(latent_state, action)
        return next_state

    def forward(
        self,
        observation: torch.Tensor,
        actions: Optional[torch.Tensor] = None,
        rollout_steps: int = 10,
    ) -> WorldModelOutput:
        """
        Encode the observation, reconstruct it and assemble all losses.

        When ``actions`` is given, a dynamics consistency term is added to
        the total loss as well.

        Args:
            observation: [batch, obs_dim]
            actions: optional [batch, rollout_steps, action_dim]
            rollout_steps: accepted for interface compatibility; this method
                does not read it (the rollout length follows ``actions``)

        Returns:
            WorldModelOutput with latent state, reconstruction, losses and
            surprise information
        """
        kl_weight = 0.01
        dynamics_weight = 0.1

        # Encode, then decode the sampled latent straight back.
        mu, log_variance, z = self.encode(observation)
        recon = self.predict(z)

        # Individual loss terms.
        recon_term = F.mse_loss(recon, observation)
        kl_term = self.encoder.kl_divergence(mu, log_variance)
        surprise_term, surprising = self.surprise_detector.compute_surprise(
            observation, recon
        )

        # Weighted sum; the dynamics term is optional.
        combined = recon_term + kl_weight * kl_term
        if actions is not None:
            combined = combined + dynamics_weight * self._compute_dynamics_loss(
                observation, actions
            )

        return WorldModelOutput(
            latent_state=z,
            encoder_mean=mu,
            encoder_log_var=log_variance,
            reconstructed_obs=recon,
            reconstruction_loss=recon_term,
            kl_loss=kl_term,
            surprise_loss=surprise_term,
            total_loss=combined,
            is_surprising=surprising,
            surprise_score=surprise_term.item(),
        )

    def _compute_dynamics_loss(
        self,
        observation: torch.Tensor,
        actions: torch.Tensor,
    ) -> torch.Tensor:
        """
        Self-consistency loss for the latent dynamics.

        No future observations are available here, so the multi-step rollout
        is compared with one-step predictions made from its own states: the
        single-step prediction from state ``t`` should match the (detached)
        rollout state ``t + 1``.

        Returns:
            Mean of the per-step MSE values (0 when there is no transition).
        """
        # Note: this encodes the observation again, independently of forward().
        start_state = self.encode(observation)[2]
        rollout = self.neural_ode(start_state, actions)

        n_transitions = rollout.shape[1] - 1
        accumulated = torch.tensor(0.0, device=observation.device)

        for step in range(n_transitions):
            # The rollout state is the target, so it is cut off from autograd.
            target = rollout[:, step + 1].detach()
            one_step = self.neural_ode.single_step(rollout[:, step], actions[:, step])
            accumulated = accumulated + F.mse_loss(one_step, target)

        # max(..., 1) guards against division by zero for empty rollouts.
        return accumulated / max(n_transitions, 1)

    def get_latent_representation(self, observation: torch.Tensor) -> torch.Tensor:
        """
        Return the encoder mean for ``observation`` (no sampling involved).

        Handy for planning and visualisation.
        """
        # Calling the encoder directly yields a (mean, second value) pair.
        latent_mean, _unused = self.encoder(observation)
        return latent_mean

    def compute_reward(self, latent_state: torch.Tensor) -> torch.Tensor:
        """Run the reward head on a latent state."""
        predicted_reward = self.reward_predictor(latent_state)
        return predicted_reward

    def get_surprise_stats(self) -> dict:
        """Return whatever statistics the surprise detector reports."""
        stats = self.surprise_detector.get_stats()
        return stats
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Environment Interface
|
|
3
|
+
|
|
4
|
+
All environments inherit from this.
|
|
5
|
+
Provides a consistent API for the world model to interact with.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
import numpy as np
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseEnvironment(ABC):
    """Common interface that every concrete environment has to implement."""

    @abstractmethod
    def reset(self) -> np.ndarray:
        """Put the environment back into an initial state.

        Returns:
            The first observation of the new episode.
        """

    @abstractmethod
    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, dict]:
        """Apply one action to the environment.

        Returns:
            A ``(observation, reward, done, info)`` tuple:
                observation: the observation after the action
                reward: the reward signal
                done: True once the episode has ended
                info: additional information
        """

    @abstractmethod
    def get_observation_dim(self) -> int:
        """Return the number of entries in an observation."""

    @abstractmethod
    def get_action_dim(self) -> int:
        """Return the number of entries in an action."""

    @abstractmethod
    def render(self) -> Optional[np.ndarray]:
        """Draw the environment.

        Returns:
            An RGB array, or None.
        """

    def close(self):
        """Release resources; the default implementation does nothing."""
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Grid World Environment
|
|
3
|
+
|
|
4
|
+
A simple 2D grid where an agent navigates to reach a goal.
|
|
5
|
+
Objects can be placed on the grid and have basic properties.
|
|
6
|
+
|
|
7
|
+
This is the TESTBED for Phase 1 — simple enough to learn,
|
|
8
|
+
complex enough to test understanding.
|
|
9
|
+
|
|
10
|
+
Observation space: [agent_x, agent_y, goal_x, goal_y, obj1_x, obj1_y, obj1_exists, ...]
|
|
11
|
+
Action space: [dx, dy] continuous movement
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from typing import Optional
|
|
16
|
+
from .base import BaseEnvironment
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GridWorld(BaseEnvironment):
    """
    2D grid world with:
    - Agent (orange) that moves around
    - Goal (green) to reach
    - Obstacles (red) that block movement
    - Collectibles (blue) that give reward

    Positions are stored as float arrays ``[x, y]``. Objects are placed on
    integer cells, while the agent moves continuously inside
    ``[0, grid_size - 1]`` on both axes.
    """

    def __init__(
        self,
        grid_size: int = 8,
        num_obstacles: int = 3,
        num_collectibles: int = 2,
        max_steps: int = 100,
        seed: Optional[int] = None,
    ):
        """
        Store the configuration; no layout exists until ``reset()`` is called.

        Args:
            grid_size: number of cells along each side of the square grid
            num_obstacles: obstacles placed on every reset
            num_collectibles: collectibles placed on every reset
            max_steps: episode length limit
            seed: seed for the private random generator (None = unseeded)
        """
        self.grid_size = grid_size
        self.num_obstacles = num_obstacles
        self.num_collectibles = num_collectibles
        self.max_steps = max_steps

        self.rng = np.random.RandomState(seed)

        # Episode state, filled in by reset().
        self.agent_pos = None
        self.goal_pos = None
        self.obstacles = None
        self.collectibles = None
        self.collected = None
        self.steps = 0

        # Observation: [agent_x, agent_y, goal_x, goal_y,
        #               obj_x, obj_y, obj_exists, ...]
        # i.e. 4 values for agent and goal plus 3 values per object.
        self._obs_dim = 4 + 3 * (num_obstacles + num_collectibles)
        self._action_dim = 2  # dx, dy

    def get_observation_dim(self) -> int:
        """Return the length of the observation vector."""
        return self._obs_dim

    def get_action_dim(self) -> int:
        """Return the length of the action vector (dx, dy)."""
        return self._action_dim

    def _random_cell(self) -> np.ndarray:
        """Draw one uniformly random integer cell as a float ``[x, y]`` array."""
        return self.rng.randint(0, self.grid_size, size=2).astype(float)

    def _sample_free_cell(self, occupied: list) -> np.ndarray:
        """Draw random cells until one is found that is not in ``occupied``."""
        while True:
            cell = self._random_cell()
            if not any(np.array_equal(cell, taken) for taken in occupied):
                return cell

    def reset(self) -> np.ndarray:
        """
        Reset to a random configuration.

        Agent, goal, obstacles and collectibles are placed (in that order)
        on pairwise distinct cells.

        Returns:
            The initial observation.

        Raises:
            ValueError: if the grid does not have enough cells to hold the
                agent, the goal and all objects on distinct cells. Without
                this check the rejection sampling below would never finish.
        """
        cells_needed = 2 + self.num_obstacles + self.num_collectibles
        if self.grid_size < 1 or cells_needed > self.grid_size ** 2:
            raise ValueError(
                f"grid_size={self.grid_size} provides too few cells for "
                f"{cells_needed} distinct positions (agent, goal, "
                f"{self.num_obstacles} obstacles, "
                f"{self.num_collectibles} collectibles)"
            )

        self.steps = 0
        self.collected = [False] * self.num_collectibles

        # The order of random draws (agent, goal, obstacles, collectibles)
        # is kept stable so seeded runs stay reproducible.
        self.agent_pos = self._random_cell()
        occupied = [self.agent_pos]

        self.goal_pos = self._sample_free_cell(occupied)
        occupied.append(self.goal_pos)

        self.obstacles = []
        for _ in range(self.num_obstacles):
            cell = self._sample_free_cell(occupied)
            self.obstacles.append(cell)
            occupied.append(cell)

        self.collectibles = []
        for _ in range(self.num_collectibles):
            cell = self._sample_free_cell(occupied)
            self.collectibles.append(cell)
            occupied.append(cell)

        return self._get_observation()

    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, dict]:
        """
        Take a movement action.

        The action is clipped to [-1, 1] per axis and the resulting position
        to the grid bounds. If the destination collides with an obstacle the
        agent stays where it is (only the destination is checked, not the
        path towards it).

        Rewards: +10 for reaching the goal, +1 per newly picked collectible,
        -0.01 per step, -0.5 for bumping into an obstacle.

        Returns:
            (observation, reward, done, info); ``done`` is True when the
            goal was reached or ``max_steps`` steps have been taken.
        """
        self.steps += 1

        # Clip the move, apply it, then clip to the grid bounds.
        move = np.clip(action, -1.0, 1.0)
        new_pos = np.clip(self.agent_pos + move, 0, self.grid_size - 1)

        # A blocked destination cancels the move entirely.
        hit_obstacle = any(
            self._check_collision(new_pos, obstacle) for obstacle in self.obstacles
        )
        if hit_obstacle:
            new_pos = self.agent_pos.copy()

        self.agent_pos = new_pos

        reward = 0.0

        reached_goal = self._check_reached(self.agent_pos, self.goal_pos)
        if reached_goal:
            reward += 10.0

        # Each collectible can be picked up only once.
        for i, coll_pos in enumerate(self.collectibles):
            if not self.collected[i] and self._check_reached(self.agent_pos, coll_pos):
                self.collected[i] = True
                reward += 1.0

        # Small step penalty (encourages efficiency).
        reward -= 0.01

        if hit_obstacle:
            reward -= 0.5

        done = reached_goal or self.steps >= self.max_steps

        info = {
            "reached_goal": reached_goal,
            "hit_obstacle": hit_obstacle,
            "steps": self.steps,
            "collected": sum(self.collected),
        }

        return self._get_observation(), reward, done, info

    def _check_collision(self, pos1: np.ndarray, pos2: np.ndarray, threshold: float = 0.5) -> bool:
        """Check if two positions are closer than ``threshold`` (Euclidean)."""
        # bool() turns numpy's np.bool_ into a plain Python bool so that
        # `done` and the info dict carry built-in types.
        return bool(np.linalg.norm(pos1 - pos2) < threshold)

    def _check_reached(self, pos: np.ndarray, target: np.ndarray, threshold: float = 0.5) -> bool:
        """Check if ``pos`` is closer than ``threshold`` to ``target``."""
        return bool(np.linalg.norm(pos - target) < threshold)

    def _get_observation(self) -> np.ndarray:
        """
        Build the observation vector from the current state.

        Layout: agent (x, y), goal (x, y), then (x, y, exists) for every
        obstacle followed by (x, y, exists) for every collectible.
        Coordinates are divided by ``grid_size``; since positions never
        exceed ``grid_size - 1`` the values lie in [0, 1).
        """
        obs = []

        obs.extend(self.agent_pos / self.grid_size)
        obs.extend(self.goal_pos / self.grid_size)

        for obs_pos in self.obstacles:
            obs.extend(obs_pos / self.grid_size)
            obs.append(1.0)  # obstacles always exist

        for coll_pos, taken in zip(self.collectibles, self.collected):
            obs.extend(coll_pos / self.grid_size)
            obs.append(0.0 if taken else 1.0)  # 0 once collected

        return np.array(obs, dtype=np.float32)

    @staticmethod
    def _paint_cell(img: np.ndarray, pos: np.ndarray, color: list) -> None:
        """Colour the cell containing ``pos`` (coordinates truncated to int)."""
        x, y = int(pos[0]), int(pos[1])
        img[y, x] = color  # row index is y, column index is x

    def render(self) -> np.ndarray:
        """
        Render the grid as an RGB image of shape (grid_size, grid_size, 3).

        Later layers overwrite earlier ones: obstacles, remaining
        collectibles, goal, agent.
        """
        # Light gray background.
        img = np.full((self.grid_size, self.grid_size, 3), 240, dtype=np.uint8)

        for obs_pos in self.obstacles:
            self._paint_cell(img, obs_pos, [220, 50, 50])  # red

        for coll_pos, taken in zip(self.collectibles, self.collected):
            if not taken:
                self._paint_cell(img, coll_pos, [50, 50, 220])  # blue

        self._paint_cell(img, self.goal_pos, [50, 200, 50])  # green
        self._paint_cell(img, self.agent_pos, [255, 165, 0])  # orange

        return img

    def get_state(self) -> dict:
        """Return a copy of the full internal state (for debugging)."""
        return {
            "agent_pos": self.agent_pos.copy(),
            "goal_pos": self.goal_pos.copy(),
            "obstacles": [o.copy() for o in self.obstacles],
            "collectibles": [c.copy() for c in self.collectibles],
            "collected": self.collected.copy(),
            "steps": self.steps,
        }
|