drlab 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- drlab-0.1.0/LICENSE +21 -0
- drlab-0.1.0/PKG-INFO +276 -0
- drlab-0.1.0/README.md +238 -0
- drlab-0.1.0/pyproject.toml +55 -0
- drlab-0.1.0/setup.cfg +4 -0
- drlab-0.1.0/src/drlab/__init__.py +36 -0
- drlab-0.1.0/src/drlab/controllers/__init__.py +6 -0
- drlab-0.1.0/src/drlab/controllers/base.py +17 -0
- drlab-0.1.0/src/drlab/controllers/e_greedy.py +46 -0
- drlab-0.1.0/src/drlab/controllers/greedy.py +20 -0
- drlab-0.1.0/src/drlab/controllers/stochastic_controller.py +17 -0
- drlab-0.1.0/src/drlab/experiments/__init__.py +9 -0
- drlab-0.1.0/src/drlab/experiments/ac_experiment.py +102 -0
- drlab-0.1.0/src/drlab/experiments/dqn_experiment.py +151 -0
- drlab-0.1.0/src/drlab/learners/__init__.py +4 -0
- drlab-0.1.0/src/drlab/learners/actor_critic.py +206 -0
- drlab-0.1.0/src/drlab/learners/dqn.py +149 -0
- drlab-0.1.0/src/drlab/replay/__init__.py +4 -0
- drlab-0.1.0/src/drlab/replay/replay_buffer.py +86 -0
- drlab-0.1.0/src/drlab/replay/transition_batch.py +33 -0
- drlab-0.1.0/src/drlab/runners/__init__.py +3 -0
- drlab-0.1.0/src/drlab/runners/runner.py +123 -0
- drlab-0.1.0/src/drlab.egg-info/PKG-INFO +276 -0
- drlab-0.1.0/src/drlab.egg-info/SOURCES.txt +30 -0
- drlab-0.1.0/src/drlab.egg-info/dependency_links.txt +1 -0
- drlab-0.1.0/src/drlab.egg-info/requires.txt +21 -0
- drlab-0.1.0/src/drlab.egg-info/top_level.txt +1 -0
- drlab-0.1.0/tests/test_controllers.py +80 -0
- drlab-0.1.0/tests/test_learners.py +74 -0
- drlab-0.1.0/tests/test_public_api.py +29 -0
- drlab-0.1.0/tests/test_replay.py +107 -0
- drlab-0.1.0/tests/test_runner.py +81 -0
drlab-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tomàs Osarte
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
drlab-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: drlab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deep Reinforcement Learning kit for research.
|
|
5
|
+
Author: Tomas Osarte
|
|
6
|
+
Keywords: reinforcement-learning,deep-learning,pytorch,gymnasium
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: torch>=2.2
|
|
19
|
+
Requires-Dist: numpy>=1.26
|
|
20
|
+
Requires-Dist: gymnasium>=1.0
|
|
21
|
+
Requires-Dist: tqdm>=4.66
|
|
22
|
+
Requires-Dist: tensorboard>=2.16
|
|
23
|
+
Provides-Extra: experiments
|
|
24
|
+
Requires-Dist: ale-py>=0.11; extra == "experiments"
|
|
25
|
+
Requires-Dist: gymnasium[box2d]>=1.0; extra == "experiments"
|
|
26
|
+
Requires-Dist: matplotlib>=3.8; extra == "experiments"
|
|
27
|
+
Requires-Dist: minigrid>=3.0; extra == "experiments"
|
|
28
|
+
Requires-Dist: opencv-python>=4.10; extra == "experiments"
|
|
29
|
+
Requires-Dist: scipy>=1.11; extra == "experiments"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
32
|
+
Requires-Dist: ipykernel>=6.29; extra == "dev"
|
|
33
|
+
Requires-Dist: notebook>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pip-tools>=7.4; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
36
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# drlab
|
|
40
|
+
|
|
41
|
+
`drlab` is a small deep reinforcement learning package for research code and
|
|
42
|
+
experiments. It provides reusable building blocks for Gymnasium environments:
|
|
43
|
+
|
|
44
|
+
- DQN and actor-critic learners
|
|
45
|
+
- greedy, epsilon-greedy, and stochastic controllers
|
|
46
|
+
- a transition runner for collecting environment interaction
|
|
47
|
+
- replay buffer and transition batch utilities
|
|
48
|
+
- lightweight experiment wrappers with TensorBoard logging
|
|
49
|
+
|
|
50
|
+
The package is designed around small, composable pieces: a PyTorch model,
|
|
51
|
+
controller, runner, learner, and optionally an experiment wrapper.
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
From the repository root:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python -m pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
For experiment and development dependencies:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
python -m pip install -e ".[experiments,dev]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Package Overview
|
|
68
|
+
|
|
69
|
+
Public classes are available from the package root:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from drlab import (
|
|
73
|
+
ActorCritic,
|
|
74
|
+
ActorCriticConfig,
|
|
75
|
+
ActorCriticExperiment,
|
|
76
|
+
ActorCriticExperimentConfig,
|
|
77
|
+
Controller,
|
|
78
|
+
DQN,
|
|
79
|
+
DQNConfig,
|
|
80
|
+
DQNExperiment,
|
|
81
|
+
DQNExperimentConfig,
|
|
82
|
+
EpsilonGreedyController,
|
|
83
|
+
GreedyController,
|
|
84
|
+
ReplayBuffer,
|
|
85
|
+
Runner,
|
|
86
|
+
StochasticController,
|
|
87
|
+
TransitionBatch,
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
They can also be imported from their subpackages:
|
|
92
|
+
|
|
93
|
+
| Subpackage | Exports | Purpose |
|
|
94
|
+
| --- | --- | --- |
|
|
95
|
+
| `drlab.learners` | `DQN`, `DQNConfig`, `ActorCritic`, `ActorCriticConfig` | Update PyTorch models from transition batches. |
|
|
96
|
+
| `drlab.controllers` | `Controller`, `GreedyController`, `EpsilonGreedyController`, `StochasticController` | Convert model outputs into environment actions. |
|
|
97
|
+
| `drlab.runners` | `Runner` | Collect transitions from a Gymnasium environment. |
|
|
98
|
+
| `drlab.replay` | `ReplayBuffer`, `TransitionBatch` | Store, sample, move, and concatenate transitions. |
|
|
99
|
+
| `drlab.experiments` | `DQNExperiment`, `DQNExperimentConfig`, `ActorCriticExperiment`, `ActorCriticExperimentConfig` | Run training loops with logging and progress bars. |
|
|
100
|
+
|
|
101
|
+
## Implemented Algorithms
|
|
102
|
+
|
|
103
|
+
| Algorithm | Type | Implementation Summary |
|
|
104
|
+
| --- | --- | --- |
|
|
105
|
+
| DQN | Off-policy value-based RL | Trains a Q-network with one-step TD targets from `(state, action, reward, done, next_state)` batches. It supports replay-buffer training through `DQNExperiment`, target networks, Double DQN action selection, hard or soft target-network updates, gradient clipping, configurable discounting, and custom regularizers. |
|
|
106
|
+
| Actor-Critic | On-policy policy-gradient RL | Trains a shared policy/value network from transition batches and returns. The policy head is optimized with advantage-weighted log probabilities, while the value head can use TD targets or full returns. It supports bootstrapped advantages, optional baseline subtraction, advantage normalization, entropy regularization with annealing, gradient clipping, custom regularizers, and PPO-style clipped policy updates for extra optimization passes. |
|
|
107
|
+
|
|
108
|
+
The package also includes reusable action-selection controllers:
|
|
109
|
+
|
|
110
|
+
- `GreedyController`: deterministic argmax action selection from model scores.
|
|
111
|
+
- `EpsilonGreedyController`: epsilon-greedy exploration with linear annealing.
|
|
112
|
+
- `StochasticController`: samples actions from softmax probabilities.
|
|
113
|
+
|
|
114
|
+
## Model Output Convention
|
|
115
|
+
|
|
116
|
+
Controllers and learners expect the model output to use a shared layout:
|
|
117
|
+
|
|
118
|
+
- DQN models should output at least `num_actions` columns. The first
|
|
119
|
+
`num_actions` columns are treated as action scores.
|
|
120
|
+
- Actor-critic models should output at least `num_actions + 1` columns. The
|
|
121
|
+
first `num_actions` columns are policy logits, and the next column is the
|
|
122
|
+
value estimate.
|
|
123
|
+
|
|
124
|
+
## Quick DQN Example
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import gymnasium as gym
|
|
128
|
+
import torch as th
|
|
129
|
+
|
|
130
|
+
from drlab import (
|
|
131
|
+
DQN,
|
|
132
|
+
DQNConfig,
|
|
133
|
+
DQNExperiment,
|
|
134
|
+
DQNExperimentConfig,
|
|
135
|
+
EpsilonGreedyController,
|
|
136
|
+
GreedyController,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
env = gym.make("CartPole-v1")
|
|
140
|
+
|
|
141
|
+
model = th.nn.Sequential(
|
|
142
|
+
th.nn.Linear(4, 64),
|
|
143
|
+
th.nn.ReLU(),
|
|
144
|
+
th.nn.Linear(64, 2),
|
|
145
|
+
)
|
|
146
|
+
optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
|
|
147
|
+
|
|
148
|
+
learner = DQN(model, optimizer, DQNConfig(num_actions=2))
|
|
149
|
+
controller = EpsilonGreedyController(
|
|
150
|
+
GreedyController(model, num_actions=2),
|
|
151
|
+
num_actions=2,
|
|
152
|
+
max_eps=1.0,
|
|
153
|
+
min_eps=0.05,
|
|
154
|
+
anneal_steps=10_000,
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
experiment = DQNExperiment(
|
|
158
|
+
env,
|
|
159
|
+
controller,
|
|
160
|
+
learner,
|
|
161
|
+
DQNExperimentConfig(
|
|
162
|
+
max_steps=20_000,
|
|
163
|
+
run_steps=1,
|
|
164
|
+
batch_size=128,
|
|
165
|
+
log_dir="runs/cartpole_dqn",
|
|
166
|
+
),
|
|
167
|
+
)
|
|
168
|
+
experiment.run()
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Core Components
|
|
172
|
+
|
|
173
|
+
### Learners
|
|
174
|
+
|
|
175
|
+
`DQN` trains a Q-network from `(rewards, dones, states, actions, next_states)`.
|
|
176
|
+
Its config supports target networks, double Q-learning, hard or soft target
|
|
177
|
+
updates, gradient clipping, discounting, and custom regularizers.
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from drlab.learners import DQN, DQNConfig
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
`ActorCritic` trains a policy/value network from transition batches with
|
|
184
|
+
returns. Its config supports TD or return-based value targets, bootstrapped
|
|
185
|
+
advantages, PPO-style clipping, entropy regularization, advantage
|
|
186
|
+
normalization, and custom regularizers.
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from drlab.learners import ActorCritic, ActorCriticConfig
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Controllers
|
|
193
|
+
|
|
194
|
+
Controllers wrap a PyTorch model (or, for `EpsilonGreedyController`, another controller) and expose:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
action = controller.choose(obs)
|
|
198
|
+
probs = controller.probabilities(obs)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Available controllers:
|
|
202
|
+
|
|
203
|
+
- `GreedyController`: selects the highest-scoring action.
|
|
204
|
+
- `EpsilonGreedyController`: wraps another controller and adds annealed random
|
|
205
|
+
exploration.
|
|
206
|
+
- `StochasticController`: samples actions from softmax probabilities.
|
|
207
|
+
|
|
208
|
+
### Runner
|
|
209
|
+
|
|
210
|
+
`Runner` steps through a Gymnasium environment with a controller and returns:
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
batch, ep_returns, ep_lengths, last_episode = runner.run(num_steps)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
`num_steps <= 0` collects one complete episode. Positive values collect up to
|
|
217
|
+
that many transitions. The returned `batch` is a `TransitionBatch`.
|
|
218
|
+
|
|
219
|
+
### Replay
|
|
220
|
+
|
|
221
|
+
`TransitionBatch` stores tensors for:
|
|
222
|
+
|
|
223
|
+
- `states`
|
|
224
|
+
- `actions`
|
|
225
|
+
- `rewards`
|
|
226
|
+
- `dones`
|
|
227
|
+
- `next_states`
|
|
228
|
+
- `returns`
|
|
229
|
+
|
|
230
|
+
It provides `.to(device)` and `.cat(other)` helpers.
|
|
231
|
+
|
|
232
|
+
`ReplayBuffer` stores fixed-capacity NumPy arrays and returns sampled or full
|
|
233
|
+
data as `TransitionBatch` instances:
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
buffer = ReplayBuffer(capacity=10_000, obs_shape=env.observation_space.shape)
|
|
237
|
+
batch = buffer.sample(128)
|
|
238
|
+
all_data = buffer.get_all()
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Experiments
|
|
242
|
+
|
|
243
|
+
Experiment wrappers combine an environment, controller, learner, runner, replay
|
|
244
|
+
buffer behavior, progress bar, and TensorBoard logging.
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from drlab.experiments import (
|
|
248
|
+
ActorCriticExperiment,
|
|
249
|
+
ActorCriticExperimentConfig,
|
|
250
|
+
DQNExperiment,
|
|
251
|
+
DQNExperimentConfig,
|
|
252
|
+
)
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
Use `DQNExperiment` for off-policy DQN training and `ActorCriticExperiment` for
|
|
256
|
+
on-policy actor-critic training.
|
|
257
|
+
|
|
258
|
+
## Development
|
|
259
|
+
|
|
260
|
+
Install development dependencies:
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
python -m pip install -e ".[dev]"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Run the test suite:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
python -m pytest -v
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Build a wheel:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
python -m build --wheel
|
|
276
|
+
```
|
drlab-0.1.0/README.md
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# drlab
|
|
2
|
+
|
|
3
|
+
`drlab` is a small deep reinforcement learning package for research code and
|
|
4
|
+
experiments. It provides reusable building blocks for Gymnasium environments:
|
|
5
|
+
|
|
6
|
+
- DQN and actor-critic learners
|
|
7
|
+
- greedy, epsilon-greedy, and stochastic controllers
|
|
8
|
+
- a transition runner for collecting environment interaction
|
|
9
|
+
- replay buffer and transition batch utilities
|
|
10
|
+
- lightweight experiment wrappers with TensorBoard logging
|
|
11
|
+
|
|
12
|
+
The package is designed around small, composable pieces: a PyTorch model,
|
|
13
|
+
controller, runner, learner, and optionally an experiment wrapper.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
From the repository root:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
python -m pip install -e .
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
For experiment and development dependencies:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
python -m pip install -e ".[experiments,dev]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Package Overview
|
|
30
|
+
|
|
31
|
+
Public classes are available from the package root:
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from drlab import (
|
|
35
|
+
ActorCritic,
|
|
36
|
+
ActorCriticConfig,
|
|
37
|
+
ActorCriticExperiment,
|
|
38
|
+
ActorCriticExperimentConfig,
|
|
39
|
+
Controller,
|
|
40
|
+
DQN,
|
|
41
|
+
DQNConfig,
|
|
42
|
+
DQNExperiment,
|
|
43
|
+
DQNExperimentConfig,
|
|
44
|
+
EpsilonGreedyController,
|
|
45
|
+
GreedyController,
|
|
46
|
+
ReplayBuffer,
|
|
47
|
+
Runner,
|
|
48
|
+
StochasticController,
|
|
49
|
+
TransitionBatch,
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
They can also be imported from their subpackages:
|
|
54
|
+
|
|
55
|
+
| Subpackage | Exports | Purpose |
|
|
56
|
+
| --- | --- | --- |
|
|
57
|
+
| `drlab.learners` | `DQN`, `DQNConfig`, `ActorCritic`, `ActorCriticConfig` | Update PyTorch models from transition batches. |
|
|
58
|
+
| `drlab.controllers` | `Controller`, `GreedyController`, `EpsilonGreedyController`, `StochasticController` | Convert model outputs into environment actions. |
|
|
59
|
+
| `drlab.runners` | `Runner` | Collect transitions from a Gymnasium environment. |
|
|
60
|
+
| `drlab.replay` | `ReplayBuffer`, `TransitionBatch` | Store, sample, move, and concatenate transitions. |
|
|
61
|
+
| `drlab.experiments` | `DQNExperiment`, `DQNExperimentConfig`, `ActorCriticExperiment`, `ActorCriticExperimentConfig` | Run training loops with logging and progress bars. |
|
|
62
|
+
|
|
63
|
+
## Implemented Algorithms
|
|
64
|
+
|
|
65
|
+
| Algorithm | Type | Implementation Summary |
|
|
66
|
+
| --- | --- | --- |
|
|
67
|
+
| DQN | Off-policy value-based RL | Trains a Q-network with one-step TD targets from `(state, action, reward, done, next_state)` batches. It supports replay-buffer training through `DQNExperiment`, target networks, Double DQN action selection, hard or soft target-network updates, gradient clipping, configurable discounting, and custom regularizers. |
|
|
68
|
+
| Actor-Critic | On-policy policy-gradient RL | Trains a shared policy/value network from transition batches and returns. The policy head is optimized with advantage-weighted log probabilities, while the value head can use TD targets or full returns. It supports bootstrapped advantages, optional baseline subtraction, advantage normalization, entropy regularization with annealing, gradient clipping, custom regularizers, and PPO-style clipped policy updates for extra optimization passes. |
|
|
69
|
+
|
|
70
|
+
The package also includes reusable action-selection controllers:
|
|
71
|
+
|
|
72
|
+
- `GreedyController`: deterministic argmax action selection from model scores.
|
|
73
|
+
- `EpsilonGreedyController`: epsilon-greedy exploration with linear annealing.
|
|
74
|
+
- `StochasticController`: samples actions from softmax probabilities.
|
|
75
|
+
|
|
76
|
+
## Model Output Convention
|
|
77
|
+
|
|
78
|
+
Controllers and learners expect the model output to use a shared layout:
|
|
79
|
+
|
|
80
|
+
- DQN models should output at least `num_actions` columns. The first
|
|
81
|
+
`num_actions` columns are treated as action scores.
|
|
82
|
+
- Actor-critic models should output at least `num_actions + 1` columns. The
|
|
83
|
+
first `num_actions` columns are policy logits, and the next column is the
|
|
84
|
+
value estimate.
|
|
85
|
+
|
|
86
|
+
## Quick DQN Example
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import gymnasium as gym
|
|
90
|
+
import torch as th
|
|
91
|
+
|
|
92
|
+
from drlab import (
|
|
93
|
+
DQN,
|
|
94
|
+
DQNConfig,
|
|
95
|
+
DQNExperiment,
|
|
96
|
+
DQNExperimentConfig,
|
|
97
|
+
EpsilonGreedyController,
|
|
98
|
+
GreedyController,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
env = gym.make("CartPole-v1")
|
|
102
|
+
|
|
103
|
+
model = th.nn.Sequential(
|
|
104
|
+
th.nn.Linear(4, 64),
|
|
105
|
+
th.nn.ReLU(),
|
|
106
|
+
th.nn.Linear(64, 2),
|
|
107
|
+
)
|
|
108
|
+
optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
|
|
109
|
+
|
|
110
|
+
learner = DQN(model, optimizer, DQNConfig(num_actions=2))
|
|
111
|
+
controller = EpsilonGreedyController(
|
|
112
|
+
GreedyController(model, num_actions=2),
|
|
113
|
+
num_actions=2,
|
|
114
|
+
max_eps=1.0,
|
|
115
|
+
min_eps=0.05,
|
|
116
|
+
anneal_steps=10_000,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
experiment = DQNExperiment(
|
|
120
|
+
env,
|
|
121
|
+
controller,
|
|
122
|
+
learner,
|
|
123
|
+
DQNExperimentConfig(
|
|
124
|
+
max_steps=20_000,
|
|
125
|
+
run_steps=1,
|
|
126
|
+
batch_size=128,
|
|
127
|
+
log_dir="runs/cartpole_dqn",
|
|
128
|
+
),
|
|
129
|
+
)
|
|
130
|
+
experiment.run()
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Core Components
|
|
134
|
+
|
|
135
|
+
### Learners
|
|
136
|
+
|
|
137
|
+
`DQN` trains a Q-network from `(rewards, dones, states, actions, next_states)`.
|
|
138
|
+
Its config supports target networks, double Q-learning, hard or soft target
|
|
139
|
+
updates, gradient clipping, discounting, and custom regularizers.
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from drlab.learners import DQN, DQNConfig
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
`ActorCritic` trains a policy/value network from transition batches with
|
|
146
|
+
returns. Its config supports TD or return-based value targets, bootstrapped
|
|
147
|
+
advantages, PPO-style clipping, entropy regularization, advantage
|
|
148
|
+
normalization, and custom regularizers.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from drlab.learners import ActorCritic, ActorCriticConfig
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Controllers
|
|
155
|
+
|
|
156
|
+
Controllers wrap a PyTorch model (or, for `EpsilonGreedyController`, another controller) and expose:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
action = controller.choose(obs)
|
|
160
|
+
probs = controller.probabilities(obs)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Available controllers:
|
|
164
|
+
|
|
165
|
+
- `GreedyController`: selects the highest-scoring action.
|
|
166
|
+
- `EpsilonGreedyController`: wraps another controller and adds annealed random
|
|
167
|
+
exploration.
|
|
168
|
+
- `StochasticController`: samples actions from softmax probabilities.
|
|
169
|
+
|
|
170
|
+
### Runner
|
|
171
|
+
|
|
172
|
+
`Runner` steps through a Gymnasium environment with a controller and returns:
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
batch, ep_returns, ep_lengths, last_episode = runner.run(num_steps)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
`num_steps <= 0` collects one complete episode. Positive values collect up to
|
|
179
|
+
that many transitions. The returned `batch` is a `TransitionBatch`.
|
|
180
|
+
|
|
181
|
+
### Replay
|
|
182
|
+
|
|
183
|
+
`TransitionBatch` stores tensors for:
|
|
184
|
+
|
|
185
|
+
- `states`
|
|
186
|
+
- `actions`
|
|
187
|
+
- `rewards`
|
|
188
|
+
- `dones`
|
|
189
|
+
- `next_states`
|
|
190
|
+
- `returns`
|
|
191
|
+
|
|
192
|
+
It provides `.to(device)` and `.cat(other)` helpers.
|
|
193
|
+
|
|
194
|
+
`ReplayBuffer` stores fixed-capacity NumPy arrays and returns sampled or full
|
|
195
|
+
data as `TransitionBatch` instances:
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
buffer = ReplayBuffer(capacity=10_000, obs_shape=env.observation_space.shape)
|
|
199
|
+
batch = buffer.sample(128)
|
|
200
|
+
all_data = buffer.get_all()
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Experiments
|
|
204
|
+
|
|
205
|
+
Experiment wrappers combine an environment, controller, learner, runner, replay
|
|
206
|
+
buffer behavior, progress bar, and TensorBoard logging.
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from drlab.experiments import (
|
|
210
|
+
ActorCriticExperiment,
|
|
211
|
+
ActorCriticExperimentConfig,
|
|
212
|
+
DQNExperiment,
|
|
213
|
+
DQNExperimentConfig,
|
|
214
|
+
)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Use `DQNExperiment` for off-policy DQN training and `ActorCriticExperiment` for
|
|
218
|
+
on-policy actor-critic training.
|
|
219
|
+
|
|
220
|
+
## Development
|
|
221
|
+
|
|
222
|
+
Install development dependencies:
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
python -m pip install -e ".[dev]"
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Run the test suite:
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
python -m pytest -v
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Build a wheel:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
python -m build --wheel
|
|
238
|
+
```
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "drlab"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Deep Reinforcement Learning kit for research."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Tomas Osarte" }]
|
|
12
|
+
keywords = ["reinforcement-learning", "deep-learning", "pytorch", "gymnasium"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"torch>=2.2",
|
|
25
|
+
"numpy>=1.26",
|
|
26
|
+
"gymnasium>=1.0",
|
|
27
|
+
"tqdm>=4.66",
|
|
28
|
+
"tensorboard>=2.16",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
experiments = [
|
|
33
|
+
"ale-py>=0.11",
|
|
34
|
+
"gymnasium[box2d]>=1.0",
|
|
35
|
+
"matplotlib>=3.8",
|
|
36
|
+
"minigrid>=3.0",
|
|
37
|
+
"opencv-python>=4.10",
|
|
38
|
+
"scipy>=1.11",
|
|
39
|
+
]
|
|
40
|
+
dev = [
|
|
41
|
+
"build>=1.2",
|
|
42
|
+
"ipykernel>=6.29",
|
|
43
|
+
"notebook>=7.0",
|
|
44
|
+
"pip-tools>=7.4",
|
|
45
|
+
"pytest>=8.0",
|
|
46
|
+
"ruff>=0.6",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.setuptools]
|
|
50
|
+
package-dir = {"" = "src"}
|
|
51
|
+
include-package-data = true
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
where = ["src"]
|
|
55
|
+
include = ["drlab*"]
|
drlab-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from drlab.controllers import (
|
|
2
|
+
Controller,
|
|
3
|
+
EpsilonGreedyController,
|
|
4
|
+
GreedyController,
|
|
5
|
+
StochasticController,
|
|
6
|
+
)
|
|
7
|
+
from drlab.experiments import (
|
|
8
|
+
ActorCriticExperiment,
|
|
9
|
+
ActorCriticExperimentConfig,
|
|
10
|
+
DQNExperiment,
|
|
11
|
+
DQNExperimentConfig,
|
|
12
|
+
)
|
|
13
|
+
from drlab.learners import ActorCritic, ActorCriticConfig, DQN, DQNConfig
|
|
14
|
+
from drlab.replay import ReplayBuffer, TransitionBatch
|
|
15
|
+
from drlab.runners import Runner
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"__version__",
|
|
21
|
+
"ActorCritic",
|
|
22
|
+
"ActorCriticConfig",
|
|
23
|
+
"ActorCriticExperiment",
|
|
24
|
+
"ActorCriticExperimentConfig",
|
|
25
|
+
"Controller",
|
|
26
|
+
"DQN",
|
|
27
|
+
"DQNConfig",
|
|
28
|
+
"DQNExperiment",
|
|
29
|
+
"DQNExperimentConfig",
|
|
30
|
+
"EpsilonGreedyController",
|
|
31
|
+
"GreedyController",
|
|
32
|
+
"ReplayBuffer",
|
|
33
|
+
"Runner",
|
|
34
|
+
"StochasticController",
|
|
35
|
+
"TransitionBatch",
|
|
36
|
+
]
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from .base import Controller
|
|
2
|
+
from .greedy import GreedyController
|
|
3
|
+
from .e_greedy import EpsilonGreedyController
|
|
4
|
+
from .stochastic_controller import StochasticController
|
|
5
|
+
|
|
6
|
+
__all__ = ["Controller", "GreedyController", "EpsilonGreedyController", "StochasticController"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import torch as th
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
|
|
6
|
+
class Controller(ABC):
    """Abstract controller interface.

    Concrete controllers must implement :meth:`choose` and
    :meth:`probabilities`. The class-level attributes below act as shared
    defaults that subclasses overwrite on their instances.
    """

    # Number of discrete actions. Declared without a default, so a subclass
    # has to assign it before it is read.
    num_actions: int
    # Model used by the controller; remains ``None`` until a subclass sets it.
    # The ``X | None`` spelling is safe here because the module enables
    # ``from __future__ import annotations`` (annotations are not evaluated).
    model: th.nn.Module | None = None
    # Inner controller for wrapping controllers; ``None`` when nothing is wrapped.
    controller: Controller | None = None

    @abstractmethod
    def choose(self, obs: th.Tensor, **kwargs) -> th.Tensor:
        """Return the action(s) selected for the observation(s) ``obs``."""

    @abstractmethod
    def probabilities(self, obs: th.Tensor, **kwargs) -> th.Tensor:
        """Return the action probabilities for the observation(s) ``obs``."""
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch as th
|
|
3
|
+
from .base import Controller
|
|
4
|
+
|
|
5
|
+
class EpsilonGreedyController(Controller):
    """Wrap another controller and add annealed epsilon-greedy exploration.

    Epsilon starts at ``max_eps`` and decreases linearly with the number of
    counted decisions until it reaches ``min_eps`` after ``anneal_steps - 1``
    decisions.
    """

    def __init__(
        self,
        controller: Controller,
        num_actions: int,
        max_eps: float = 1.0,
        min_eps: float = 0.1,
        anneal_steps: int = 10_000,
    ):
        """Store the wrapped controller and the annealing schedule.

        Raises:
            ValueError: if ``anneal_steps`` is not at least 2 (the schedule
                divides by ``anneal_steps - 1``).
        """
        self.controller = controller
        self.num_actions = num_actions
        # Expose the wrapped controller's model under the same attribute name.
        self.model = controller.model
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.anneal_steps = anneal_steps
        # Count of decisions made so far; drives the annealing schedule.
        self.num_decisions = 0

        if anneal_steps <= 1:
            raise ValueError("anneal_steps must be >= 2")

    def epsilon(self) -> float:
        """Return the current exploration rate of the linear schedule."""
        remaining = 1 - self.num_decisions / (self.anneal_steps - 1)
        # Clamp at zero so epsilon stays at ``min_eps`` once annealing is over.
        frac = max(remaining, 0.0)
        span = self.max_eps - self.min_eps
        return frac * span + self.min_eps

    def choose(self, obs: th.Tensor, increase_counter: bool = True, **kwargs) -> th.Tensor:
        """Pick random actions with probability epsilon, else delegate.

        Epsilon is read before the decision counter is advanced. A single
        random draw decides the outcome for the whole call: either every
        returned action is sampled uniformly, or the wrapped controller's
        choice is returned unchanged.
        """
        explore_prob = self.epsilon()
        if increase_counter:
            self.num_decisions += 1

        # Observations with more than one dimension are treated as batched
        # along the first axis; anything else counts as a single observation.
        if obs.ndim > 1:
            batch_size = obs.shape[0]
        else:
            batch_size = 1

        exploit = not (np.random.rand() < explore_prob)
        if exploit:
            return self.controller.choose(obs, **kwargs)

        return th.randint(
            self.num_actions, (batch_size,), device=obs.device, dtype=th.long
        )

    def probabilities(self, obs: th.Tensor, **kwargs) -> th.Tensor:
        """Return the epsilon-mixture of uniform and wrapped probabilities."""
        eps = self.epsilon()
        # NOTE(review): the wrapped controller is expected to return a
        # [B, num_actions] tensor here -- confirm against its implementation.
        inner = self.controller.probabilities(obs, **kwargs)
        rows = inner.shape[0]
        floor = th.full(
            (rows, self.num_actions), eps / self.num_actions, device=inner.device
        )
        return floor + (1 - eps) * inner
|