drlab 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. drlab-0.1.0/LICENSE +21 -0
  2. drlab-0.1.0/PKG-INFO +276 -0
  3. drlab-0.1.0/README.md +238 -0
  4. drlab-0.1.0/pyproject.toml +55 -0
  5. drlab-0.1.0/setup.cfg +4 -0
  6. drlab-0.1.0/src/drlab/__init__.py +36 -0
  7. drlab-0.1.0/src/drlab/controllers/__init__.py +6 -0
  8. drlab-0.1.0/src/drlab/controllers/base.py +17 -0
  9. drlab-0.1.0/src/drlab/controllers/e_greedy.py +46 -0
  10. drlab-0.1.0/src/drlab/controllers/greedy.py +20 -0
  11. drlab-0.1.0/src/drlab/controllers/stochastic_controller.py +17 -0
  12. drlab-0.1.0/src/drlab/experiments/__init__.py +9 -0
  13. drlab-0.1.0/src/drlab/experiments/ac_experiment.py +102 -0
  14. drlab-0.1.0/src/drlab/experiments/dqn_experiment.py +151 -0
  15. drlab-0.1.0/src/drlab/learners/__init__.py +4 -0
  16. drlab-0.1.0/src/drlab/learners/actor_critic.py +206 -0
  17. drlab-0.1.0/src/drlab/learners/dqn.py +149 -0
  18. drlab-0.1.0/src/drlab/replay/__init__.py +4 -0
  19. drlab-0.1.0/src/drlab/replay/replay_buffer.py +86 -0
  20. drlab-0.1.0/src/drlab/replay/transition_batch.py +33 -0
  21. drlab-0.1.0/src/drlab/runners/__init__.py +3 -0
  22. drlab-0.1.0/src/drlab/runners/runner.py +123 -0
  23. drlab-0.1.0/src/drlab.egg-info/PKG-INFO +276 -0
  24. drlab-0.1.0/src/drlab.egg-info/SOURCES.txt +30 -0
  25. drlab-0.1.0/src/drlab.egg-info/dependency_links.txt +1 -0
  26. drlab-0.1.0/src/drlab.egg-info/requires.txt +21 -0
  27. drlab-0.1.0/src/drlab.egg-info/top_level.txt +1 -0
  28. drlab-0.1.0/tests/test_controllers.py +80 -0
  29. drlab-0.1.0/tests/test_learners.py +74 -0
  30. drlab-0.1.0/tests/test_public_api.py +29 -0
  31. drlab-0.1.0/tests/test_replay.py +107 -0
  32. drlab-0.1.0/tests/test_runner.py +81 -0
drlab-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Tomàs Osarte
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
drlab-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,276 @@
1
+ Metadata-Version: 2.4
2
+ Name: drlab
3
+ Version: 0.1.0
4
+ Summary: Deep Reinforcement Learning kit for research.
5
+ Author: Tomas Osarte
6
+ Keywords: reinforcement-learning,deep-learning,pytorch,gymnasium
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: torch>=2.2
19
+ Requires-Dist: numpy>=1.26
20
+ Requires-Dist: gymnasium>=1.0
21
+ Requires-Dist: tqdm>=4.66
22
+ Requires-Dist: tensorboard>=2.16
23
+ Provides-Extra: experiments
24
+ Requires-Dist: ale-py>=0.11; extra == "experiments"
25
+ Requires-Dist: gymnasium[box2d]>=1.0; extra == "experiments"
26
+ Requires-Dist: matplotlib>=3.8; extra == "experiments"
27
+ Requires-Dist: minigrid>=3.0; extra == "experiments"
28
+ Requires-Dist: opencv-python>=4.10; extra == "experiments"
29
+ Requires-Dist: scipy>=1.11; extra == "experiments"
30
+ Provides-Extra: dev
31
+ Requires-Dist: build>=1.2; extra == "dev"
32
+ Requires-Dist: ipykernel>=6.29; extra == "dev"
33
+ Requires-Dist: notebook>=7.0; extra == "dev"
34
+ Requires-Dist: pip-tools>=7.4; extra == "dev"
35
+ Requires-Dist: pytest>=8.0; extra == "dev"
36
+ Requires-Dist: ruff>=0.6; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # drlab
40
+
41
+ `drlab` is a small deep reinforcement learning package for research code and
42
+ experiments. It provides reusable building blocks for Gymnasium environments:
43
+
44
+ - DQN and actor-critic learners
45
+ - greedy, epsilon-greedy, and stochastic controllers
46
+ - a transition runner for collecting environment interaction
47
+ - replay buffer and transition batch utilities
48
+ - lightweight experiment wrappers with TensorBoard logging
49
+
50
+ The package is designed around small, composable pieces: a PyTorch model,
51
+ controller, runner, learner, and optionally an experiment wrapper.
52
+
53
+ ## Installation
54
+
55
+ From the repository root:
56
+
57
+ ```bash
58
+ python -m pip install -e .
59
+ ```
60
+
61
+ For experiment and development dependencies:
62
+
63
+ ```bash
64
+ python -m pip install -e ".[experiments,dev]"
65
+ ```
66
+
67
+ ## Package Overview
68
+
69
+ Public classes are available from the package root:
70
+
71
+ ```python
72
+ from drlab import (
73
+ ActorCritic,
74
+ ActorCriticConfig,
75
+ ActorCriticExperiment,
76
+ ActorCriticExperimentConfig,
77
+ Controller,
78
+ DQN,
79
+ DQNConfig,
80
+ DQNExperiment,
81
+ DQNExperimentConfig,
82
+ EpsilonGreedyController,
83
+ GreedyController,
84
+ ReplayBuffer,
85
+ Runner,
86
+ StochasticController,
87
+ TransitionBatch,
88
+ )
89
+ ```
90
+
91
+ They can also be imported from their subpackages:
92
+
93
+ | Subpackage | Exports | Purpose |
94
+ | --- | --- | --- |
95
+ | `drlab.learners` | `DQN`, `DQNConfig`, `ActorCritic`, `ActorCriticConfig` | Update PyTorch models from transition batches. |
96
+ | `drlab.controllers` | `Controller`, `GreedyController`, `EpsilonGreedyController`, `StochasticController` | Convert model outputs into environment actions. |
97
+ | `drlab.runners` | `Runner` | Collect transitions from a Gymnasium environment. |
98
+ | `drlab.replay` | `ReplayBuffer`, `TransitionBatch` | Store, sample, move, and concatenate transitions. |
99
+ | `drlab.experiments` | `DQNExperiment`, `DQNExperimentConfig`, `ActorCriticExperiment`, `ActorCriticExperimentConfig` | Run training loops with logging and progress bars. |
100
+
101
+ ## Implemented Algorithms
102
+
103
+ | Algorithm | Type | Implementation Summary |
104
+ | --- | --- | --- |
105
+ | DQN | Off-policy value-based RL | Trains a Q-network with one-step TD targets from `(state, action, reward, done, next_state)` batches. It supports replay-buffer training through `DQNExperiment`, target networks, Double DQN action selection, hard or soft target-network updates, gradient clipping, configurable discounting, and custom regularizers. |
106
+ | Actor-Critic | On-policy policy-gradient RL | Trains a shared policy/value network from transition batches and returns. The policy head is optimized with advantage-weighted log probabilities, while the value head can use TD targets or full returns. It supports bootstrapped advantages, optional baseline subtraction, advantage normalization, entropy regularization with annealing, gradient clipping, custom regularizers, and PPO-style clipped policy updates for extra optimization passes. |
107
+
108
+ The package also includes reusable action-selection controllers:
109
+
110
+ - `GreedyController`: deterministic argmax action selection from model scores.
111
+ - `EpsilonGreedyController`: epsilon-greedy exploration with linear annealing.
112
+ - `StochasticController`: samples actions from softmax probabilities.
113
+
114
+ ## Model Output Convention
115
+
116
+ Controllers and learners expect the model output to use a shared layout:
117
+
118
+ - DQN models should output at least `num_actions` columns. The first
119
+ `num_actions` columns are treated as action scores.
120
+ - Actor-critic models should output at least `num_actions + 1` columns. The
121
+ first `num_actions` columns are policy logits, and the next column is the
122
+ value estimate.
123
+
124
+ ## Quick DQN Example
125
+
126
+ ```python
127
+ import gymnasium as gym
128
+ import torch as th
129
+
130
+ from drlab import (
131
+ DQN,
132
+ DQNConfig,
133
+ DQNExperiment,
134
+ DQNExperimentConfig,
135
+ EpsilonGreedyController,
136
+ GreedyController,
137
+ )
138
+
139
+ env = gym.make("CartPole-v1")
140
+
141
+ model = th.nn.Sequential(
142
+ th.nn.Linear(4, 64),
143
+ th.nn.ReLU(),
144
+ th.nn.Linear(64, 2),
145
+ )
146
+ optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
147
+
148
+ learner = DQN(model, optimizer, DQNConfig(num_actions=2))
149
+ controller = EpsilonGreedyController(
150
+ GreedyController(model, num_actions=2),
151
+ num_actions=2,
152
+ max_eps=1.0,
153
+ min_eps=0.05,
154
+ anneal_steps=10_000,
155
+ )
156
+
157
+ experiment = DQNExperiment(
158
+ env,
159
+ controller,
160
+ learner,
161
+ DQNExperimentConfig(
162
+ max_steps=20_000,
163
+ run_steps=1,
164
+ batch_size=128,
165
+ log_dir="runs/cartpole_dqn",
166
+ ),
167
+ )
168
+ experiment.run()
169
+ ```
170
+
171
+ ## Core Components
172
+
173
+ ### Learners
174
+
175
+ `DQN` trains a Q-network from `(rewards, dones, states, actions, next_states)`.
176
+ Its config supports target networks, double Q-learning, hard or soft target
177
+ updates, gradient clipping, discounting, and custom regularizers.
178
+
179
+ ```python
180
+ from drlab.learners import DQN, DQNConfig
181
+ ```
182
+
183
+ `ActorCritic` trains a policy/value network from transition batches with
184
+ returns. Its config supports TD or return-based value targets, bootstrapped
185
+ advantages, PPO-style clipping, entropy regularization, advantage
186
+ normalization, and custom regularizers.
187
+
188
+ ```python
189
+ from drlab.learners import ActorCritic, ActorCriticConfig
190
+ ```
191
+
192
+ ### Controllers
193
+
194
+ Controllers wrap a PyTorch model and expose:
195
+
196
+ ```python
197
+ action = controller.choose(obs)
198
+ probs = controller.probabilities(obs)
199
+ ```
200
+
201
+ Available controllers:
202
+
203
+ - `GreedyController`: selects the highest-scoring action.
204
+ - `EpsilonGreedyController`: wraps another controller and adds annealed random
205
+ exploration.
206
+ - `StochasticController`: samples actions from softmax probabilities.
207
+
208
+ ### Runner
209
+
210
+ `Runner` steps through a Gymnasium environment with a controller and returns:
211
+
212
+ ```python
213
+ batch, ep_returns, ep_lengths, last_episode = runner.run(num_steps)
214
+ ```
215
+
216
+ `num_steps <= 0` collects one complete episode. Positive values collect up to
217
+ that many transitions. The returned `batch` is a `TransitionBatch`.
218
+
219
+ ### Replay
220
+
221
+ `TransitionBatch` stores tensors for:
222
+
223
+ - `states`
224
+ - `actions`
225
+ - `rewards`
226
+ - `dones`
227
+ - `next_states`
228
+ - `returns`
229
+
230
+ It provides `.to(device)` and `.cat(other)` helpers.
231
+
232
+ `ReplayBuffer` stores fixed-capacity NumPy arrays and returns sampled or full
233
+ data as `TransitionBatch` instances:
234
+
235
+ ```python
236
+ buffer = ReplayBuffer(capacity=10_000, obs_shape=env.observation_space.shape)
237
+ batch = buffer.sample(128)
238
+ all_data = buffer.get_all()
239
+ ```
240
+
241
+ ### Experiments
242
+
243
+ Experiment wrappers combine an environment, controller, learner, runner, replay
244
+ buffer behavior, progress bar, and TensorBoard logging.
245
+
246
+ ```python
247
+ from drlab.experiments import (
248
+ ActorCriticExperiment,
249
+ ActorCriticExperimentConfig,
250
+ DQNExperiment,
251
+ DQNExperimentConfig,
252
+ )
253
+ ```
254
+
255
+ Use `DQNExperiment` for off-policy DQN training and `ActorCriticExperiment` for
256
+ on-policy actor-critic training.
257
+
258
+ ## Development
259
+
260
+ Install development dependencies:
261
+
262
+ ```bash
263
+ python -m pip install -e ".[dev]"
264
+ ```
265
+
266
+ Run the test suite:
267
+
268
+ ```bash
269
+ python -m unittest discover -s tests -v
270
+ ```
271
+
272
+ Build a wheel:
273
+
274
+ ```bash
275
+ python -m build --wheel
276
+ ```
drlab-0.1.0/README.md ADDED
@@ -0,0 +1,238 @@
1
+ # drlab
2
+
3
+ `drlab` is a small deep reinforcement learning package for research code and
4
+ experiments. It provides reusable building blocks for Gymnasium environments:
5
+
6
+ - DQN and actor-critic learners
7
+ - greedy, epsilon-greedy, and stochastic controllers
8
+ - a transition runner for collecting environment interaction
9
+ - replay buffer and transition batch utilities
10
+ - lightweight experiment wrappers with TensorBoard logging
11
+
12
+ The package is designed around small, composable pieces: a PyTorch model,
13
+ controller, runner, learner, and optionally an experiment wrapper.
14
+
15
+ ## Installation
16
+
17
+ From the repository root:
18
+
19
+ ```bash
20
+ python -m pip install -e .
21
+ ```
22
+
23
+ For experiment and development dependencies:
24
+
25
+ ```bash
26
+ python -m pip install -e ".[experiments,dev]"
27
+ ```
28
+
29
+ ## Package Overview
30
+
31
+ Public classes are available from the package root:
32
+
33
+ ```python
34
+ from drlab import (
35
+ ActorCritic,
36
+ ActorCriticConfig,
37
+ ActorCriticExperiment,
38
+ ActorCriticExperimentConfig,
39
+ Controller,
40
+ DQN,
41
+ DQNConfig,
42
+ DQNExperiment,
43
+ DQNExperimentConfig,
44
+ EpsilonGreedyController,
45
+ GreedyController,
46
+ ReplayBuffer,
47
+ Runner,
48
+ StochasticController,
49
+ TransitionBatch,
50
+ )
51
+ ```
52
+
53
+ They can also be imported from their subpackages:
54
+
55
+ | Subpackage | Exports | Purpose |
56
+ | --- | --- | --- |
57
+ | `drlab.learners` | `DQN`, `DQNConfig`, `ActorCritic`, `ActorCriticConfig` | Update PyTorch models from transition batches. |
58
+ | `drlab.controllers` | `Controller`, `GreedyController`, `EpsilonGreedyController`, `StochasticController` | Convert model outputs into environment actions. |
59
+ | `drlab.runners` | `Runner` | Collect transitions from a Gymnasium environment. |
60
+ | `drlab.replay` | `ReplayBuffer`, `TransitionBatch` | Store, sample, move, and concatenate transitions. |
61
+ | `drlab.experiments` | `DQNExperiment`, `DQNExperimentConfig`, `ActorCriticExperiment`, `ActorCriticExperimentConfig` | Run training loops with logging and progress bars. |
62
+
63
+ ## Implemented Algorithms
64
+
65
+ | Algorithm | Type | Implementation Summary |
66
+ | --- | --- | --- |
67
+ | DQN | Off-policy value-based RL | Trains a Q-network with one-step TD targets from `(state, action, reward, done, next_state)` batches. It supports replay-buffer training through `DQNExperiment`, target networks, Double DQN action selection, hard or soft target-network updates, gradient clipping, configurable discounting, and custom regularizers. |
68
+ | Actor-Critic | On-policy policy-gradient RL | Trains a shared policy/value network from transition batches and returns. The policy head is optimized with advantage-weighted log probabilities, while the value head can use TD targets or full returns. It supports bootstrapped advantages, optional baseline subtraction, advantage normalization, entropy regularization with annealing, gradient clipping, custom regularizers, and PPO-style clipped policy updates for extra optimization passes. |
69
+
70
+ The package also includes reusable action-selection controllers:
71
+
72
+ - `GreedyController`: deterministic argmax action selection from model scores.
73
+ - `EpsilonGreedyController`: epsilon-greedy exploration with linear annealing.
74
+ - `StochasticController`: samples actions from softmax probabilities.
75
+
76
+ ## Model Output Convention
77
+
78
+ Controllers and learners expect the model output to use a shared layout:
79
+
80
+ - DQN models should output at least `num_actions` columns. The first
81
+ `num_actions` columns are treated as action scores.
82
+ - Actor-critic models should output at least `num_actions + 1` columns. The
83
+ first `num_actions` columns are policy logits, and the next column is the
84
+ value estimate.
85
+
86
+ ## Quick DQN Example
87
+
88
+ ```python
89
+ import gymnasium as gym
90
+ import torch as th
91
+
92
+ from drlab import (
93
+ DQN,
94
+ DQNConfig,
95
+ DQNExperiment,
96
+ DQNExperimentConfig,
97
+ EpsilonGreedyController,
98
+ GreedyController,
99
+ )
100
+
101
+ env = gym.make("CartPole-v1")
102
+
103
+ model = th.nn.Sequential(
104
+ th.nn.Linear(4, 64),
105
+ th.nn.ReLU(),
106
+ th.nn.Linear(64, 2),
107
+ )
108
+ optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
109
+
110
+ learner = DQN(model, optimizer, DQNConfig(num_actions=2))
111
+ controller = EpsilonGreedyController(
112
+ GreedyController(model, num_actions=2),
113
+ num_actions=2,
114
+ max_eps=1.0,
115
+ min_eps=0.05,
116
+ anneal_steps=10_000,
117
+ )
118
+
119
+ experiment = DQNExperiment(
120
+ env,
121
+ controller,
122
+ learner,
123
+ DQNExperimentConfig(
124
+ max_steps=20_000,
125
+ run_steps=1,
126
+ batch_size=128,
127
+ log_dir="runs/cartpole_dqn",
128
+ ),
129
+ )
130
+ experiment.run()
131
+ ```
132
+
133
+ ## Core Components
134
+
135
+ ### Learners
136
+
137
+ `DQN` trains a Q-network from `(rewards, dones, states, actions, next_states)`.
138
+ Its config supports target networks, double Q-learning, hard or soft target
139
+ updates, gradient clipping, discounting, and custom regularizers.
140
+
141
+ ```python
142
+ from drlab.learners import DQN, DQNConfig
143
+ ```
144
+
145
+ `ActorCritic` trains a policy/value network from transition batches with
146
+ returns. Its config supports TD or return-based value targets, bootstrapped
147
+ advantages, PPO-style clipping, entropy regularization, advantage
148
+ normalization, and custom regularizers.
149
+
150
+ ```python
151
+ from drlab.learners import ActorCritic, ActorCriticConfig
152
+ ```
153
+
154
+ ### Controllers
155
+
156
+ Controllers wrap a PyTorch model and expose:
157
+
158
+ ```python
159
+ action = controller.choose(obs)
160
+ probs = controller.probabilities(obs)
161
+ ```
162
+
163
+ Available controllers:
164
+
165
+ - `GreedyController`: selects the highest-scoring action.
166
+ - `EpsilonGreedyController`: wraps another controller and adds annealed random
167
+ exploration.
168
+ - `StochasticController`: samples actions from softmax probabilities.
169
+
170
+ ### Runner
171
+
172
+ `Runner` steps through a Gymnasium environment with a controller and returns:
173
+
174
+ ```python
175
+ batch, ep_returns, ep_lengths, last_episode = runner.run(num_steps)
176
+ ```
177
+
178
+ `num_steps <= 0` collects one complete episode. Positive values collect up to
179
+ that many transitions. The returned `batch` is a `TransitionBatch`.
180
+
181
+ ### Replay
182
+
183
+ `TransitionBatch` stores tensors for:
184
+
185
+ - `states`
186
+ - `actions`
187
+ - `rewards`
188
+ - `dones`
189
+ - `next_states`
190
+ - `returns`
191
+
192
+ It provides `.to(device)` and `.cat(other)` helpers.
193
+
194
+ `ReplayBuffer` stores fixed-capacity NumPy arrays and returns sampled or full
195
+ data as `TransitionBatch` instances:
196
+
197
+ ```python
198
+ buffer = ReplayBuffer(capacity=10_000, obs_shape=env.observation_space.shape)
199
+ batch = buffer.sample(128)
200
+ all_data = buffer.get_all()
201
+ ```
202
+
203
+ ### Experiments
204
+
205
+ Experiment wrappers combine an environment, controller, learner, runner, replay
206
+ buffer behavior, progress bar, and TensorBoard logging.
207
+
208
+ ```python
209
+ from drlab.experiments import (
210
+ ActorCriticExperiment,
211
+ ActorCriticExperimentConfig,
212
+ DQNExperiment,
213
+ DQNExperimentConfig,
214
+ )
215
+ ```
216
+
217
+ Use `DQNExperiment` for off-policy DQN training and `ActorCriticExperiment` for
218
+ on-policy actor-critic training.
219
+
220
+ ## Development
221
+
222
+ Install development dependencies:
223
+
224
+ ```bash
225
+ python -m pip install -e ".[dev]"
226
+ ```
227
+
228
+ Run the test suite:
229
+
230
+ ```bash
231
+ python -m unittest discover -s tests -v
232
+ ```
233
+
234
+ Build a wheel:
235
+
236
+ ```bash
237
+ python -m build --wheel
238
+ ```
drlab-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "drlab"
7
+ version = "0.1.0"
8
+ description = "Deep Reinforcement Learning kit for research."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Tomas Osarte" }]
12
+ keywords = ["reinforcement-learning", "deep-learning", "pytorch", "gymnasium"]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "Intended Audience :: Science/Research",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = [
24
+ "torch>=2.2",
25
+ "numpy>=1.26",
26
+ "gymnasium>=1.0",
27
+ "tqdm>=4.66",
28
+ "tensorboard>=2.16",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ experiments = [
33
+ "ale-py>=0.11",
34
+ "gymnasium[box2d]>=1.0",
35
+ "matplotlib>=3.8",
36
+ "minigrid>=3.0",
37
+ "opencv-python>=4.10",
38
+ "scipy>=1.11",
39
+ ]
40
+ dev = [
41
+ "build>=1.2",
42
+ "ipykernel>=6.29",
43
+ "notebook>=7.0",
44
+ "pip-tools>=7.4",
45
+ "pytest>=8.0",
46
+ "ruff>=0.6",
47
+ ]
48
+
49
+ [tool.setuptools]
50
+ package-dir = {"" = "src"}
51
+ include-package-data = true
52
+
53
+ [tool.setuptools.packages.find]
54
+ where = ["src"]
55
+ include = ["drlab*"]
drlab-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
drlab-0.1.0/src/drlab/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ from drlab.controllers import (
2
+ Controller,
3
+ EpsilonGreedyController,
4
+ GreedyController,
5
+ StochasticController,
6
+ )
7
+ from drlab.experiments import (
8
+ ActorCriticExperiment,
9
+ ActorCriticExperimentConfig,
10
+ DQNExperiment,
11
+ DQNExperimentConfig,
12
+ )
13
+ from drlab.learners import ActorCritic, ActorCriticConfig, DQN, DQNConfig
14
+ from drlab.replay import ReplayBuffer, TransitionBatch
15
+ from drlab.runners import Runner
16
+
17
+ __version__ = "0.1.0"
18
+
19
+ __all__ = [
20
+ "__version__",
21
+ "ActorCritic",
22
+ "ActorCriticConfig",
23
+ "ActorCriticExperiment",
24
+ "ActorCriticExperimentConfig",
25
+ "Controller",
26
+ "DQN",
27
+ "DQNConfig",
28
+ "DQNExperiment",
29
+ "DQNExperimentConfig",
30
+ "EpsilonGreedyController",
31
+ "GreedyController",
32
+ "ReplayBuffer",
33
+ "Runner",
34
+ "StochasticController",
35
+ "TransitionBatch",
36
+ ]
drlab-0.1.0/src/drlab/controllers/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .base import Controller
2
+ from .greedy import GreedyController
3
+ from .e_greedy import EpsilonGreedyController
4
+ from .stochastic_controller import StochasticController
5
+
6
+ __all__ = ["Controller", "GreedyController", "EpsilonGreedyController", "StochasticController"]
drlab-0.1.0/src/drlab/controllers/base.py ADDED
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ import torch as th
4
+ from abc import ABC, abstractmethod
5
+
6
+ class Controller(ABC):
7
+ """Abstract controller interface."""
8
+
9
+ num_actions: int
10
+ model: th.nn.Module = None
11
+ controller: Controller = None
12
+
13
+ @abstractmethod
14
+ def choose(self, obs: th.Tensor, **kwargs) -> th.Tensor: ...
15
+
16
+ @abstractmethod
17
+ def probabilities(self, obs: th.Tensor, **kwargs) -> th.Tensor: ...
drlab-0.1.0/src/drlab/controllers/e_greedy.py ADDED
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ import torch as th
3
+ from .base import Controller
4
+
5
class EpsilonGreedyController(Controller):
    """Add linearly annealed epsilon-greedy exploration to another controller.

    With probability ``epsilon()`` a uniformly random action is returned;
    otherwise the decision is delegated to the wrapped controller. Epsilon
    starts at ``max_eps`` and moves linearly to ``min_eps``, which it reaches
    after ``anneal_steps - 1`` counted decisions and keeps from then on.
    """

    def __init__(
        self,
        controller: Controller,
        num_actions: int,
        max_eps: float = 1.0,
        min_eps: float = 0.1,
        anneal_steps: int = 10_000,
    ):
        """Store the wrapped controller and the annealing schedule.

        Args:
            controller: Controller consulted when no random action is taken.
            num_actions: Number of discrete actions to sample from.
            max_eps: Epsilon before any decision has been counted.
            min_eps: Epsilon once annealing has finished.
            anneal_steps: Length of the linear schedule, in counted decisions.

        Raises:
            ValueError: If ``anneal_steps`` is smaller than 2.
        """
        # Fail fast, before any state is assigned: ``epsilon`` divides by
        # ``anneal_steps - 1``.
        if anneal_steps <= 1:
            raise ValueError("anneal_steps must be >= 2")

        self.controller = controller
        self.num_actions = num_actions
        # Expose the wrapped controller's model under the same attribute.
        self.model = controller.model
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.anneal_steps = anneal_steps
        self.num_decisions = 0

    def epsilon(self) -> float:
        """Return the exploration rate for the current decision count."""
        remaining = max(1 - self.num_decisions / (self.anneal_steps - 1), 0.0)
        return remaining * (self.max_eps - self.min_eps) + self.min_eps

    def choose(self, obs: th.Tensor, increase_counter: bool = True, **kwargs) -> th.Tensor:
        """Pick random action(s) with probability epsilon, else delegate.

        Args:
            obs: Observation tensor; a tensor with more than one dimension is
                treated as a batch along dimension 0.
            increase_counter: Whether this call advances the annealing
                schedule. Epsilon is read before the counter is advanced.
            **kwargs: Forwarded to the wrapped controller's ``choose``.

        Returns:
            A ``long`` tensor of shape ``(B,)`` when exploring, otherwise
            whatever the wrapped controller returns.
        """
        eps = self.epsilon()
        if increase_counter:
            self.num_decisions += 1

        batch_size = obs.shape[0] if obs.ndim > 1 else 1
        # NOTE(review): a single draw decides exploration for the whole batch,
        # so batched observations explore or exploit together. Per-sample
        # exploration would need the wrapped controller's output shape, which
        # is not visible from this file -- confirm before changing.
        if np.random.rand() < eps:
            return th.randint(self.num_actions, (batch_size,), device=obs.device, dtype=th.long)

        return self.controller.choose(obs, **kwargs)

    def probabilities(self, obs: th.Tensor, **kwargs) -> th.Tensor:
        """Return the epsilon-greedy mixture of action probabilities.

        The result is ``eps / num_actions + (1 - eps) * p``, where ``p`` is
        the wrapped controller's probabilities. The decision counter is not
        advanced.

        Args:
            obs: Observation tensor passed to the wrapped controller.
            **kwargs: Forwarded to the wrapped controller's ``probabilities``.
        """
        eps = self.epsilon()
        # Presumably [B, A] for a batch (or [A] for a single observation);
        # verify against the wrapped controller.
        greedy = self.controller.probabilities(obs, **kwargs)
        # Take the leading dimensions from the wrapped result rather than
        # assuming a 2-D tensor, so an unbatched [A] result stays [A] instead
        # of broadcasting into an [A, A] matrix.
        uniform = th.full(
            (*greedy.shape[:-1], self.num_actions),
            eps / self.num_actions,
            device=greedy.device,
        )
        return uniform + (1 - eps) * greedy