tensor-optix 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensor_optix-0.1.0/.gitignore +53 -0
- tensor_optix-0.1.0/LICENSE +21 -0
- tensor_optix-0.1.0/PKG-INFO +299 -0
- tensor_optix-0.1.0/PLAN.md +268 -0
- tensor_optix-0.1.0/README.md +259 -0
- tensor_optix-0.1.0/pyproject.toml +42 -0
- tensor_optix-0.1.0/tensor_optix/__init__.py +29 -0
- tensor_optix-0.1.0/tensor_optix/adapters/__init__.py +0 -0
- tensor_optix-0.1.0/tensor_optix/adapters/tensorflow/__init__.py +4 -0
- tensor_optix-0.1.0/tensor_optix/adapters/tensorflow/tf_agent.py +130 -0
- tensor_optix-0.1.0/tensor_optix/adapters/tensorflow/tf_evaluator.py +86 -0
- tensor_optix-0.1.0/tensor_optix/core/__init__.py +0 -0
- tensor_optix-0.1.0/tensor_optix/core/backoff_scheduler.py +114 -0
- tensor_optix-0.1.0/tensor_optix/core/base_agent.py +57 -0
- tensor_optix-0.1.0/tensor_optix/core/base_evaluator.py +40 -0
- tensor_optix-0.1.0/tensor_optix/core/base_optimizer.py +45 -0
- tensor_optix-0.1.0/tensor_optix/core/base_pipeline.py +42 -0
- tensor_optix-0.1.0/tensor_optix/core/checkpoint_registry.py +159 -0
- tensor_optix-0.1.0/tensor_optix/core/loop_controller.py +238 -0
- tensor_optix-0.1.0/tensor_optix/core/types.py +95 -0
- tensor_optix-0.1.0/tensor_optix/optimizer.py +105 -0
- tensor_optix-0.1.0/tensor_optix/optimizers/__init__.py +4 -0
- tensor_optix-0.1.0/tensor_optix/optimizers/backoff_optimizer.py +140 -0
- tensor_optix-0.1.0/tensor_optix/optimizers/pbt_optimizer.py +117 -0
- tensor_optix-0.1.0/tensor_optix/pipeline/__init__.py +4 -0
- tensor_optix-0.1.0/tensor_optix/pipeline/batch_pipeline.py +93 -0
- tensor_optix-0.1.0/tensor_optix/pipeline/live_pipeline.py +161 -0
- tensor_optix-0.1.0/tests/conftest.py +135 -0
- tensor_optix-0.1.0/tests/test_adapters/__init__.py +0 -0
- tensor_optix-0.1.0/tests/test_adapters/test_tf_agent.py +114 -0
- tensor_optix-0.1.0/tests/test_adapters/test_tf_evaluator.py +81 -0
- tensor_optix-0.1.0/tests/test_core/__init__.py +0 -0
- tensor_optix-0.1.0/tests/test_core/test_backoff_scheduler.py +111 -0
- tensor_optix-0.1.0/tests/test_core/test_checkpoint_registry.py +78 -0
- tensor_optix-0.1.0/tests/test_core/test_loop_controller.py +107 -0
- tensor_optix-0.1.0/tests/test_core/test_types.py +54 -0
- tensor_optix-0.1.0/tests/test_integration/__init__.py +0 -0
- tensor_optix-0.1.0/tests/test_integration/test_end_to_end.py +164 -0
- tensor_optix-0.1.0/tests/test_optimizers/__init__.py +0 -0
- tensor_optix-0.1.0/tests/test_optimizers/test_backoff_optimizer.py +124 -0
- tensor_optix-0.1.0/tests/test_optimizers/test_pbt_optimizer.py +87 -0
- tensor_optix-0.1.0/tests/test_pipeline/__init__.py +0 -0
- tensor_optix-0.1.0/tests/test_pipeline/test_batch_pipeline.py +81 -0
- tensor_optix-0.1.0/tests/test_pipeline/test_live_pipeline.py +93 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
*.egg
|
|
7
|
+
*.egg-info/
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
eggs/
|
|
11
|
+
parts/
|
|
12
|
+
var/
|
|
13
|
+
sdist/
|
|
14
|
+
develop-eggs/
|
|
15
|
+
.installed.cfg
|
|
16
|
+
lib/
|
|
17
|
+
lib64/
|
|
18
|
+
|
|
19
|
+
# Virtual environments
|
|
20
|
+
.venv/
|
|
21
|
+
venv/
|
|
22
|
+
ENV/
|
|
23
|
+
env/
|
|
24
|
+
|
|
25
|
+
# uv lock (libraries don't pin lock files)
|
|
26
|
+
uv.lock
|
|
27
|
+
|
|
28
|
+
# Testing
|
|
29
|
+
.pytest_cache/
|
|
30
|
+
.coverage
|
|
31
|
+
coverage.xml
|
|
32
|
+
htmlcov/
|
|
33
|
+
.tox/
|
|
34
|
+
|
|
35
|
+
# Type checking
|
|
36
|
+
.mypy_cache/
|
|
37
|
+
.dmypy.json
|
|
38
|
+
|
|
39
|
+
# IDEs
|
|
40
|
+
.vscode/
|
|
41
|
+
.idea/
|
|
42
|
+
*.swp
|
|
43
|
+
*.swo
|
|
44
|
+
*~
|
|
45
|
+
|
|
46
|
+
# OS
|
|
47
|
+
.DS_Store
|
|
48
|
+
Thumbs.db
|
|
49
|
+
|
|
50
|
+
# Project-specific
|
|
51
|
+
tensor_optix_checkpoints/
|
|
52
|
+
*.keras
|
|
53
|
+
*.h5
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 sup3rus3r
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tensor-optix
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Autonomous continuous learning loop for TensorFlow RL agents
|
|
5
|
+
Author: sup3rus3r
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 sup3rus3r
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Python: >=3.11
|
|
29
|
+
Requires-Dist: gymnasium>=1.0.0
|
|
30
|
+
Requires-Dist: numpy>=1.24.0
|
|
31
|
+
Requires-Dist: tensorflow>=2.18.0
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: black; extra == 'dev'
|
|
34
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
38
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# tensor-optix
|
|
42
|
+
|
|
43
|
+
Autonomous continuous learning loop for TensorFlow RL agents.
|
|
44
|
+
|
|
45
|
+
> **We own the loop. You own the model.**
|
|
46
|
+
|
|
47
|
+
tensor-optix wraps your TensorFlow model and Gymnasium environment and takes full ownership of the training loop — stepping continuously, evaluating performance windows, tuning hyperparameters, checkpointing, and adapting over time without manual intervention.
|
|
48
|
+
|
|
49
|
+
**No fixed episodes.** Training runs as a continuous stream of steps. The loop determines when training ends — not the environment's `terminated`/`truncated` signals.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install tensor-optix
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**Requirements:** Python >= 3.11, TensorFlow >= 2.18, Gymnasium >= 1.0
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import tensorflow as tf
|
|
67
|
+
import gymnasium as gym
|
|
68
|
+
from tensor_optix import RLOptimizer, TFAgent, BatchPipeline, HyperparamSet
|
|
69
|
+
|
|
70
|
+
# Build your model normally
|
|
71
|
+
model = tf.keras.Sequential([
|
|
72
|
+
tf.keras.layers.Input(shape=(4,)),
|
|
73
|
+
tf.keras.layers.Dense(64, activation="relu"),
|
|
74
|
+
tf.keras.layers.Dense(64, activation="relu"),
|
|
75
|
+
tf.keras.layers.Dense(2),
|
|
76
|
+
])
|
|
77
|
+
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
|
|
78
|
+
|
|
79
|
+
agent = TFAgent(
|
|
80
|
+
model=model,
|
|
81
|
+
optimizer=optimizer,
|
|
82
|
+
hyperparams=HyperparamSet(
|
|
83
|
+
params={"learning_rate": 3e-4, "gamma": 0.99},
|
|
84
|
+
episode_id=0,
|
|
85
|
+
),
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Continuous stepping — windows of 200 steps, no forced resets
|
|
89
|
+
env = gym.make("CartPole-v1")
|
|
90
|
+
pipeline = BatchPipeline(env=env, agent=agent, window_size=200)
|
|
91
|
+
|
|
92
|
+
opt = RLOptimizer(agent=agent, pipeline=pipeline)
|
|
93
|
+
opt.run() # runs until DORMANT (plateau) or max_episodes
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## How It Works
|
|
99
|
+
|
|
100
|
+
tensor-optix runs an autonomous improvement loop with four states:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
ACTIVE → aggressive tuning, evaluates every window
|
|
104
|
+
COOLING → recent improvement, exponential backoff on eval frequency
|
|
105
|
+
DORMANT → plateau reached — model is trained, minimal intervention
|
|
106
|
+
WATCHDOG → monitoring for degradation
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**DORMANT = trained.** The backoff scheduler — not a fixed episode count — decides when the model can no longer improve.
|
|
110
|
+
|
|
111
|
+
The loop:
|
|
112
|
+
1. Steps continuously through the environment in fixed-size windows
|
|
113
|
+
2. Evaluates each window via `primary_score`
|
|
114
|
+
3. If improved: saves checkpoint, resets backoff
|
|
115
|
+
4. If plateau: backs off evaluation, eventually reaches DORMANT
|
|
116
|
+
5. If degraded: optionally rolls back to best checkpoint, re-activates
|
|
117
|
+
6. Tunes hyperparameters using two-phase finite difference
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Optimizer — Two-Phase Finite Difference
|
|
122
|
+
|
|
123
|
+
`BackoffOptimizer` uses a staggered, two-phase finite-difference update per param:
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Phase 1 (probe): apply θᵢ + δᵢ, run one window
|
|
127
|
+
Phase 2 (commit): gradient = (score_after - score_before) / δᵢ
|
|
128
|
+
if gradient > 0: keep θᵢ + δᵢ
|
|
129
|
+
if gradient < 0: apply θᵢ - δᵢ (reverse)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Params are cycled round-robin. Each param is probed and committed independently. Step size adapts on improvement and plateau.
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from tensor_optix import BackoffOptimizer
|
|
136
|
+
|
|
137
|
+
opt = RLOptimizer(
|
|
138
|
+
agent=agent,
|
|
139
|
+
pipeline=pipeline,
|
|
140
|
+
optimizer=BackoffOptimizer(
|
|
141
|
+
param_bounds={
|
|
142
|
+
"learning_rate": (1e-5, 1e-2),
|
|
143
|
+
"gamma": (0.9, 0.999),
|
|
144
|
+
},
|
|
145
|
+
perturbation_scale=0.05,
|
|
146
|
+
),
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### PBTOptimizer
|
|
151
|
+
|
|
152
|
+
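Pseudo population-based training. Maintains a history of `(hyperparams, score)` pairs. When the current score falls in the bottom 20% of that history, it exploits by perturbing params taken from the top performers; otherwise it explores by perturbing the current params.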
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from tensor_optix import PBTOptimizer
|
|
156
|
+
|
|
157
|
+
opt = RLOptimizer(
|
|
158
|
+
agent=agent,
|
|
159
|
+
pipeline=pipeline,
|
|
160
|
+
optimizer=PBTOptimizer(
|
|
161
|
+
param_bounds={"learning_rate": (1e-5, 1e-2)},
|
|
162
|
+
history_size=50,
|
|
163
|
+
),
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Custom Evaluator
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from tensor_optix import BaseEvaluator, EpisodeData, EvalMetrics
|
|
173
|
+
|
|
174
|
+
class TotalRewardEvaluator(BaseEvaluator):
|
|
175
|
+
def score(self, episode_data: EpisodeData, train_diagnostics: dict) -> EvalMetrics:
|
|
176
|
+
total = sum(episode_data.rewards)
|
|
177
|
+
return EvalMetrics(
|
|
178
|
+
primary_score=total,
|
|
179
|
+
metrics={"total_reward": total},
|
|
180
|
+
episode_id=episode_data.episode_id,
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
opt = RLOptimizer(agent=agent, pipeline=pipeline, evaluator=TotalRewardEvaluator())
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Custom Agent (Algorithm-Specific Learning)
|
|
189
|
+
|
|
190
|
+
`TFAgent` provides a REINFORCE baseline. Subclass and override `learn()` for PPO, SAC, DQN, etc.:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from tensor_optix import TFAgent
|
|
194
|
+
from tensor_optix.core.types import EpisodeData
|
|
195
|
+
import tensorflow as tf
|
|
196
|
+
|
|
197
|
+
class PPOAgent(TFAgent):
|
|
198
|
+
def learn(self, episode_data: EpisodeData) -> dict:
|
|
199
|
+
clip_ratio = self._hyperparams.params.get("clip_ratio", 0.2)
|
|
200
|
+
# ... PPO update logic ...
|
|
201
|
+
return {"loss": loss_value, "entropy": entropy_value}
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Live Pipeline
|
|
207
|
+
|
|
208
|
+
For real-time data sources (trading, robotics, online environments):
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from tensor_optix import LivePipeline
|
|
212
|
+
|
|
213
|
+
class MyFeed:
|
|
214
|
+
def stream(self):
|
|
215
|
+
while True:
|
|
216
|
+
yield obs, reward, terminated, truncated, info
|
|
217
|
+
|
|
218
|
+
pipeline = LivePipeline(
|
|
219
|
+
data_source=MyFeed(),
|
|
220
|
+
agent=agent,
|
|
221
|
+
episode_boundary_fn=LivePipeline.every_n_seconds(300),
|
|
222
|
+
)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
---
|
|
226
|
+
|
|
227
|
+
## Callbacks
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from tensor_optix import LoopCallback
|
|
231
|
+
|
|
232
|
+
class MyLogger(LoopCallback):
|
|
233
|
+
def on_improvement(self, snapshot):
|
|
234
|
+
print(f"New best: {snapshot.eval_metrics.primary_score:.4f}")
|
|
235
|
+
|
|
236
|
+
def on_dormant(self, window_id):
|
|
237
|
+
print(f"Training complete at window {window_id}")
|
|
238
|
+
|
|
239
|
+
opt = RLOptimizer(agent=agent, pipeline=pipeline, callbacks=[MyLogger()])
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
Available hooks: `on_loop_start`, `on_loop_stop`, `on_episode_end`, `on_improvement`, `on_plateau`, `on_dormant`, `on_degradation`, `on_hyperparam_update`.
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Full Configuration
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
opt = RLOptimizer(
|
|
250
|
+
agent=agent,
|
|
251
|
+
pipeline=pipeline,
|
|
252
|
+
evaluator=None, # default: TFEvaluator
|
|
253
|
+
optimizer=None, # default: BackoffOptimizer
|
|
254
|
+
checkpoint_dir="./checkpoints",
|
|
255
|
+
max_snapshots=10,
|
|
256
|
+
rollback_on_degradation=False,
|
|
257
|
+
improvement_margin=0.0,
|
|
258
|
+
max_episodes=None, # None = run until DORMANT
|
|
259
|
+
base_interval=1,
|
|
260
|
+
backoff_factor=2.0,
|
|
261
|
+
max_interval_episodes=100,
|
|
262
|
+
plateau_threshold=5,
|
|
263
|
+
dormant_threshold=20,
|
|
264
|
+
degradation_threshold=0.95,
|
|
265
|
+
callbacks=[],
|
|
266
|
+
)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Architecture
|
|
272
|
+
|
|
273
|
+
```
|
|
274
|
+
tensor_optix/
|
|
275
|
+
├── core/
|
|
276
|
+
│ ├── types.py # EpisodeData, EvalMetrics, HyperparamSet, LoopState
|
|
277
|
+
│ ├── base_agent.py # BaseAgent — 6-method contract
|
|
278
|
+
│ ├── base_evaluator.py
|
|
279
|
+
│ ├── base_optimizer.py
|
|
280
|
+
│ ├── base_pipeline.py
|
|
281
|
+
│ ├── loop_controller.py # State machine + main loop
|
|
282
|
+
│ ├── checkpoint_registry.py
|
|
283
|
+
│ └── backoff_scheduler.py
|
|
284
|
+
├── adapters/tensorflow/
|
|
285
|
+
│ ├── tf_agent.py # TFAgent — Keras model wrapper
|
|
286
|
+
│ └── tf_evaluator.py # TFEvaluator — default scorer
|
|
287
|
+
├── pipeline/
|
|
288
|
+
│ ├── batch_pipeline.py # Continuous stepping, fixed windows
|
|
289
|
+
│ └── live_pipeline.py # Real-time streaming
|
|
290
|
+
└── optimizers/
|
|
291
|
+
├── backoff_optimizer.py # Two-phase finite difference
|
|
292
|
+
└── pbt_optimizer.py # Pseudo population-based training
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT — Copyright (c) 2026 sup3rus3r
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# tensor-optix — Living Implementation Plan
|
|
2
|
+
|
|
3
|
+
> This document is the single source of truth for building tensor-optix.
|
|
4
|
+
> Update it as decisions are made, issues are found, and tasks complete.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Project Identity
|
|
9
|
+
|
|
10
|
+
- **Package name:** `tensor-optix`
|
|
11
|
+
- **Import name:** `tensor_optix`
|
|
12
|
+
- **Root directory:** `d:\development\AugData\tensor-optix\`
|
|
13
|
+
- **Python:** `>=3.11`
|
|
14
|
+
- **Framework:** TensorFlow `>=2.18.0` (TF only, no framework abstraction)
|
|
15
|
+
- **Environment API:** Gymnasium `>=1.0.0` (modern API: `terminated | truncated`, not `done`)
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## What This Is
|
|
20
|
+
|
|
21
|
+
A PyPI-distributable Python library that replaces the conventional RL training loop with an autonomous, continuously-learning optimization system. The user builds their TF model and Gymnasium environment. The library owns the training loop, evaluation, hyperparameter tuning, checkpointing, and adaptation lifecycle.
|
|
22
|
+
|
|
23
|
+
**Core philosophy:** We own the loop. The user owns the model.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Architecture Summary
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
RLOptimizer (main entry point)
|
|
31
|
+
└── LoopController (state machine + loop orchestration)
|
|
32
|
+
├── BaseAgent ← user implements this
|
|
33
|
+
├── BaseEvaluator ← user implements or use TFEvaluator default
|
|
34
|
+
├── BaseOptimizer ← BackoffOptimizer or PBTOptimizer
|
|
35
|
+
├── BasePipeline ← BatchPipeline or LivePipeline
|
|
36
|
+
├── CheckpointRegistry ← snapshot storage
|
|
37
|
+
└── BackoffScheduler ← interval + state management
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Loop States
|
|
41
|
+
| State | Behavior |
|
|
42
|
+
|-------|----------|
|
|
43
|
+
| ACTIVE | Aggressive tuning, eval every episode |
|
|
44
|
+
| COOLING | Recent improvement, exponential backoff |
|
|
45
|
+
| DORMANT | Plateau, minimal intervention |
|
|
46
|
+
| WATCHDOG | Monitoring for degradation |
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Repository Structure
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
tensor-optix/
|
|
54
|
+
├── PLAN.md ← this file
|
|
55
|
+
├── pyproject.toml
|
|
56
|
+
├── README.md
|
|
57
|
+
├── LICENSE
|
|
58
|
+
│
|
|
59
|
+
├── tensor_optix/
|
|
60
|
+
│ ├── __init__.py # Public API surface
|
|
61
|
+
│ │
|
|
62
|
+
│ ├── core/
|
|
63
|
+
│ │ ├── __init__.py
|
|
64
|
+
│ │ ├── types.py # EpisodeData, EvalMetrics, HyperparamSet, PolicySnapshot, LoopState
|
|
65
|
+
│ │ ├── base_agent.py # Abstract BaseAgent
|
|
66
|
+
│ │ ├── base_evaluator.py # Abstract BaseEvaluator
|
|
67
|
+
│ │ ├── base_optimizer.py # Abstract BaseOptimizer
|
|
68
|
+
│ │ ├── base_pipeline.py # Abstract BasePipeline + EpisodeBoundaryFn
|
|
69
|
+
│ │ ├── loop_controller.py # LoopController + LoopCallback
|
|
70
|
+
│ │ ├── checkpoint_registry.py # CheckpointRegistry
|
|
71
|
+
│ │ └── backoff_scheduler.py # BackoffScheduler
|
|
72
|
+
│ │
|
|
73
|
+
│ ├── adapters/
|
|
74
|
+
│ │ ├── __init__.py
|
|
75
|
+
│ │ └── tensorflow/
|
|
76
|
+
│ │ ├── __init__.py
|
|
77
|
+
│ │ ├── tf_agent.py # TFAgent(BaseAgent)
|
|
78
|
+
│ │ └── tf_evaluator.py # TFEvaluator(BaseEvaluator)
|
|
79
|
+
│ │
|
|
80
|
+
│ ├── pipeline/
|
|
81
|
+
│ │ ├── __init__.py
|
|
82
|
+
│ │ ├── batch_pipeline.py # BatchPipeline — Gymnasium env, episodic/batch (not a static dataset loader)
|
|
83
|
+
│ │ └── live_pipeline.py # LivePipeline — real-time streaming source
|
|
84
|
+
│ │
|
|
85
|
+
│ └── optimizers/
|
|
86
|
+
│ ├── __init__.py
|
|
87
|
+
│ ├── backoff_optimizer.py # BackoffOptimizer (default, perturbation-based)
|
|
88
|
+
│ └── pbt_optimizer.py # PBTOptimizer (pseudo population-based training)
|
|
89
|
+
│
|
|
90
|
+
└── tests/
|
|
91
|
+
├── conftest.py
|
|
92
|
+
├── test_core/
|
|
93
|
+
│ ├── test_types.py
|
|
94
|
+
│ ├── test_backoff_scheduler.py
|
|
95
|
+
│ ├── test_checkpoint_registry.py
|
|
96
|
+
│ └── test_loop_controller.py
|
|
97
|
+
├── test_adapters/
|
|
98
|
+
│ ├── test_tf_agent.py
|
|
99
|
+
│ └── test_tf_evaluator.py
|
|
100
|
+
├── test_pipeline/
|
|
101
|
+
│ ├── test_batch_pipeline.py
|
|
102
|
+
│ └── test_live_pipeline.py
|
|
103
|
+
├── test_optimizers/
|
|
104
|
+
│ ├── test_backoff_optimizer.py
|
|
105
|
+
│ └── test_pbt_optimizer.py
|
|
106
|
+
└── test_integration/
|
|
107
|
+
└── test_end_to_end.py
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Critical Rules (never violate)
|
|
113
|
+
|
|
114
|
+
1. **Gymnasium API only.** `env.reset()` → `(obs, info)`. `env.step()` → `(obs, reward, terminated, truncated, info)`. Never use legacy `done` flag internally — merge `terminated | truncated` at the pipeline boundary.
|
|
115
|
+
2. **`BaseAgent` is the only contract.** `LoopController` calls only: `act()`, `learn()`, `get_hyperparams()`, `set_hyperparams()`, `save_weights()`, `load_weights()`.
|
|
116
|
+
3. **`HyperparamSet.params` is an open dict.** Core never reads specific key names. Opaque blob passed between optimizer and agent.
|
|
117
|
+
4. **`EpisodeData` carries raw interaction data only.** No algorithm-specific fields.
|
|
118
|
+
5. **No algorithm-specific code in `core/` or `loop_controller.py`.** PPO, DQN, SAC, etc. are never referenced there.
|
|
119
|
+
6. **`LoopController` is algorithm-blind.** run episode → get score → compare → tune → repeat.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Implementation Tasks
|
|
124
|
+
|
|
125
|
+
### Phase 1 — Core Foundation
|
|
126
|
+
- [ ] `pyproject.toml`
|
|
127
|
+
- [ ] `tensor_optix/core/types.py`
|
|
128
|
+
- [ ] `tensor_optix/core/base_agent.py`
|
|
129
|
+
- [ ] `tensor_optix/core/base_evaluator.py`
|
|
130
|
+
- [ ] `tensor_optix/core/base_optimizer.py`
|
|
131
|
+
- [ ] `tensor_optix/core/base_pipeline.py`
|
|
132
|
+
- [ ] `tensor_optix/core/backoff_scheduler.py`
|
|
133
|
+
- [ ] `tensor_optix/core/checkpoint_registry.py`
|
|
134
|
+
- [ ] `tensor_optix/core/loop_controller.py`
|
|
135
|
+
|
|
136
|
+
### Phase 2 — TensorFlow Adapter
|
|
137
|
+
- [ ] `tensor_optix/adapters/tensorflow/tf_agent.py`
|
|
138
|
+
- [ ] `tensor_optix/adapters/tensorflow/tf_evaluator.py`
|
|
139
|
+
|
|
140
|
+
### Phase 3 — Pipelines
|
|
141
|
+
- [ ] `tensor_optix/pipeline/batch_pipeline.py`
|
|
142
|
+
- [ ] `tensor_optix/pipeline/live_pipeline.py`
|
|
143
|
+
|
|
144
|
+
### Phase 4 — Optimizers
|
|
145
|
+
- [ ] `tensor_optix/optimizers/backoff_optimizer.py`
|
|
146
|
+
- [ ] `tensor_optix/optimizers/pbt_optimizer.py`
|
|
147
|
+
|
|
148
|
+
### Phase 5 — Wiring
|
|
149
|
+
- [ ] `tensor_optix/optimizer.py` (RLOptimizer entry point)
|
|
150
|
+
- [ ] `tensor_optix/__init__.py` (public API surface)
|
|
151
|
+
- [ ] All `core/__init__.py`, `adapters/__init__.py`, `pipeline/__init__.py`, `optimizers/__init__.py`
|
|
152
|
+
|
|
153
|
+
### Phase 6 — Tests
|
|
154
|
+
- [ ] `tests/conftest.py`
|
|
155
|
+
- [ ] `tests/test_core/test_types.py`
|
|
156
|
+
- [ ] `tests/test_core/test_backoff_scheduler.py`
|
|
157
|
+
- [ ] `tests/test_core/test_checkpoint_registry.py`
|
|
158
|
+
- [ ] `tests/test_core/test_loop_controller.py`
|
|
159
|
+
- [ ] `tests/test_adapters/test_tf_agent.py`
|
|
160
|
+
- [ ] `tests/test_adapters/test_tf_evaluator.py`
|
|
161
|
+
- [ ] `tests/test_pipeline/test_batch_pipeline.py`
|
|
162
|
+
- [ ] `tests/test_pipeline/test_live_pipeline.py`
|
|
163
|
+
- [ ] `tests/test_optimizers/test_backoff_optimizer.py`
|
|
164
|
+
- [ ] `tests/test_optimizers/test_pbt_optimizer.py`
|
|
165
|
+
- [ ] `tests/test_integration/test_end_to_end.py`
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Known Issues / Decisions Log
|
|
170
|
+
|
|
171
|
+
| Date | Issue | Decision |
|
|
172
|
+
|------|-------|----------|
|
|
173
|
+
| 2026-03-27 | Blueprint said "framework-agnostic" | Corrected: TensorFlow only |
|
|
174
|
+
| 2026-03-27 | Blueprint used legacy gym API | Corrected: Gymnasium >=1.0.0 |
|
|
175
|
+
| 2026-03-27 | Blueprint hardcoded TF as required dep in a "framework-agnostic" core | N/A — TF-only removes the contradiction |
|
|
176
|
+
| 2026-03-27 | Degradation check `score < best * threshold` breaks for negative scores | Fixed: use `score < best - abs(best) * (1 - threshold)` |
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Notes
|
|
181
|
+
|
|
182
|
+
- `BatchPipeline` wraps a Gymnasium-compatible env for episodic/batch training. Not a static dataset loader.
|
|
183
|
+
- `LivePipeline` wraps a streaming data source (e.g. websocket feed). User provides a `stream()` generator.
|
|
184
|
+
- `TFAgent.learn()` provides a generic gradient update baseline. Users subclass and override for specific algorithms (PPO clipping, SAC entropy tuning, etc.).
|
|
185
|
+
- `PBTOptimizer` approximates population-based training for single-agent use via a virtual population from history.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Optimizer Math — BackoffOptimizer (Running Finite Difference)
|
|
190
|
+
|
|
191
|
+
### Core Idea
|
|
192
|
+
Estimate the gradient of `primary_score` w.r.t. each hyperparam using finite differences accumulated across episodes. Step in the direction that increases score.
|
|
193
|
+
|
|
194
|
+
### Per-param gradient estimate
|
|
195
|
+
```
|
|
196
|
+
∂score/∂θᵢ ≈ (score_avg_after - score_avg_before) / Δθᵢ
|
|
197
|
+
```
|
|
198
|
+
Where `score_avg` is a rolling mean over the last N episodes (noise reduction).
|
|
199
|
+
|
|
200
|
+
### Update rule
|
|
201
|
+
```
|
|
202
|
+
θᵢ_new = clip(θᵢ + α * ∂score/∂θᵢ, low_bound, high_bound)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Step size α (adaptive)
|
|
206
|
+
```
|
|
207
|
+
α = base_lr / (1 + β * score_variance)
|
|
208
|
+
```
|
|
209
|
+
High variance in recent scores → smaller steps. Low variance → larger steps.
|
|
210
|
+
|
|
211
|
+
### Perturbation size δ (per param)
|
|
212
|
+
- Multiplicative: `δᵢ = perturbation_scale * |θᵢ|` (scale-invariant)
|
|
213
|
+
- Clamped: `δᵢ = max(δᵢ, min_delta)` to avoid zero delta on small params
|
|
214
|
+
|
|
215
|
+
### Directional memory
|
|
216
|
+
- Track last direction moved per param (`+1` or `-1`)
|
|
217
|
+
- Track whether that move improved score
|
|
218
|
+
- If improvement: continue in same direction (momentum)
|
|
219
|
+
- If no improvement: reverse direction, halve step size
|
|
220
|
+
|
|
221
|
+
### Score buffer
|
|
222
|
+
- Rolling window of last `score_window` (default: 5) primary scores
|
|
223
|
+
- Use mean of buffer as the stable score signal for gradient estimation
|
|
224
|
+
- Do not update params until buffer has at least `min_samples` entries
|
|
225
|
+
|
|
226
|
+
### Bounds enforcement
|
|
227
|
+
- User provides `param_bounds: dict[str, tuple[float, float]]`
|
|
228
|
+
- Params not in bounds are left unchanged
|
|
229
|
+
- All updates clipped to `[low, high]` after step
|
|
230
|
+
|
|
231
|
+
### Variance-gated updates
|
|
232
|
+
- If `score_variance > high_variance_threshold`: skip update this cycle (too noisy to trust)
|
|
233
|
+
- Log skipped updates for observability
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Optimizer Math — PBTOptimizer (Pseudo Population-Based Training)
|
|
238
|
+
|
|
239
|
+
### Core Idea
|
|
240
|
+
Maintain a history of `(HyperparamSet, primary_score)` pairs as a virtual population. Use exploit/explore logic from PBT without parallel workers.
|
|
241
|
+
|
|
242
|
+
### Exploit condition
|
|
243
|
+
```
|
|
244
|
+
if current_score < percentile(history_scores, 20):
|
|
245
|
+
# bottom 20% — exploit top 20%
|
|
246
|
+
best_params = params from top 20% of history (by score)
|
|
247
|
+
new_params = perturb(best_params, scale=small)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Explore condition
|
|
251
|
+
```
|
|
252
|
+
else:
|
|
253
|
+
# not bottom 20% — explore
|
|
254
|
+
new_params = perturb(current_params, scale=medium)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Perturbation function (shared with BackoffOptimizer)
|
|
258
|
+
```
|
|
259
|
+
perturb(θ, scale) → for each param:
|
|
260
|
+
δ = scale * (high - low) # fraction of param range
|
|
261
|
+
new_val = θ + uniform(-δ, +δ)
|
|
262
|
+
new_val = clip(new_val, low, high)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### History management
|
|
266
|
+
- Keep last `history_size` (default: 50) `(params, score)` pairs
|
|
267
|
+
- FIFO eviction
|
|
268
|
+
- Percentile computed over this window only
|