research-pipelines 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- research_pipelines-0.1.0/LICENSE +21 -0
- research_pipelines-0.1.0/PKG-INFO +401 -0
- research_pipelines-0.1.0/README.md +355 -0
- research_pipelines-0.1.0/pyproject.toml +36 -0
- research_pipelines-0.1.0/setup.cfg +4 -0
- research_pipelines-0.1.0/src/research_pipelines/__init__.py +47 -0
- research_pipelines-0.1.0/src/research_pipelines/backends/__init__.py +7 -0
- research_pipelines-0.1.0/src/research_pipelines/backends/base.py +62 -0
- research_pipelines-0.1.0/src/research_pipelines/backends/manager.py +63 -0
- research_pipelines-0.1.0/src/research_pipelines/backends/pickle_backend.py +82 -0
- research_pipelines-0.1.0/src/research_pipelines/backends/wandb_backend.py +86 -0
- research_pipelines-0.1.0/src/research_pipelines/core.py +230 -0
- research_pipelines-0.1.0/src/research_pipelines/dag.py +188 -0
- research_pipelines-0.1.0/src/research_pipelines/decorators.py +389 -0
- research_pipelines-0.1.0/src/research_pipelines/query.py +394 -0
- research_pipelines-0.1.0/src/research_pipelines/visualize.py +89 -0
- research_pipelines-0.1.0/src/research_pipelines.egg-info/PKG-INFO +401 -0
- research_pipelines-0.1.0/src/research_pipelines.egg-info/SOURCES.txt +24 -0
- research_pipelines-0.1.0/src/research_pipelines.egg-info/dependency_links.txt +1 -0
- research_pipelines-0.1.0/src/research_pipelines.egg-info/requires.txt +14 -0
- research_pipelines-0.1.0/src/research_pipelines.egg-info/top_level.txt +1 -0
- research_pipelines-0.1.0/tests/test_backends.py +241 -0
- research_pipelines-0.1.0/tests/test_core.py +200 -0
- research_pipelines-0.1.0/tests/test_dag.py +316 -0
- research_pipelines-0.1.0/tests/test_decorators.py +455 -0
- research_pipelines-0.1.0/tests/test_integration.py +162 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Leander Kurscheidt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: research-pipelines
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight research project pipeline framework with DAG tracing
|
|
5
|
+
Author: Leander Kurscheidt
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Leander Kurscheidt
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
29
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
30
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
31
|
+
Classifier: Operating System :: OS Independent
|
|
32
|
+
Requires-Python: >=3.11
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
License-File: LICENSE
|
|
35
|
+
Provides-Extra: example
|
|
36
|
+
Requires-Dist: torch>=2.2.0; extra == "example"
|
|
37
|
+
Provides-Extra: viz
|
|
38
|
+
Requires-Dist: matplotlib; extra == "viz"
|
|
39
|
+
Requires-Dist: networkx; extra == "viz"
|
|
40
|
+
Provides-Extra: wandb
|
|
41
|
+
Requires-Dist: wandb>=0.15.0; extra == "wandb"
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: pytest>=9.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# Research Pipelines
|
|
48
|
+
|
|
49
|
+
A lightweight Python framework for tracing the DAG (directed acyclic graph) of research experiments. Automatically track the arguments and function dependencies of datasets, models, and evaluations, then persist everything to wandb or local storage. This is especially useful for plotting or further evaluation of a trained model, as we can recreate a function call or just the arguments of a traced function. By design, it is a pickle-free solution that relies on recording primitive arguments.
|
|
50
|
+
|
|
51
|
+
Just decorate a function during training like:
|
|
52
|
+
```python
|
|
53
|
+
@evaluation()
|
|
54
|
+
def evaluate(model_obj, metric: str):
|
|
55
|
+
return {"score": 0.95}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
It turns a huge, messy notebook into something simple like:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# (select a traced run and load its saved configurations)
|
|
62
|
+
# rebuild the arguments such that we can call evaluate ourselves
|
|
63
|
+
model_obj, metric = query.build_arguments(
|
|
64
|
+
evaluate
|
|
65
|
+
)
|
|
66
|
+
# load saved weights
|
|
67
|
+
model_obj.load_state_dict(state_dict)
|
|
68
|
+
# call evaluate
|
|
69
|
+
evaluate(model_obj, metric)
|
|
70
|
+
# do some plotting
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
## Features
|
|
77
|
+
|
|
78
|
+
- **Automatic DAG Tracing**: Decorators automatically detect when traced objects are used as dependencies
|
|
79
|
+
- **Configuration Persistence**: Basic types (str, int, float, bool, None) are automatically captured and stored
|
|
80
|
+
- **Flexible Rebuilding**: The query backend allows for calling the traced functions again, even if they depend on other traced functions
|
|
81
|
+
- **Pluggable Backends**: Use PickleBackend for testing or WandBBackend for production wandb integration
|
|
82
|
+
- **Zero Boilerplate**: Apply decorators and your functions/classes are automatically traced
|
|
83
|
+
- **Recursive Dependency Resolution**: Full transitive closure of all dependencies
|
|
84
|
+
|
|
85
|
+
## Installation (Dev)
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Clone or create the project
|
|
89
|
+
cd research_pipelines
|
|
90
|
+
|
|
91
|
+
# Create conda environment
|
|
92
|
+
conda create -n research_pipelines python=3.11
|
|
93
|
+
|
|
94
|
+
# Activate environment
|
|
95
|
+
conda activate research_pipelines
|
|
96
|
+
|
|
97
|
+
# Install package in editable mode
|
|
98
|
+
pip install -e .
|
|
99
|
+
|
|
100
|
+
# Optional: Install the Torch example extra
|
|
101
|
+
pip install ".[example]"
|
|
102
|
+
|
|
103
|
+
# Optional: Install wandb backend
|
|
104
|
+
pip install ".[wandb]"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Quick Start
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from research_pipelines.decorators import dataset, model, evaluation
|
|
111
|
+
from research_pipelines.dag import build_dag
|
|
112
|
+
|
|
113
|
+
# Decorate your functions
|
|
114
|
+
@dataset()
|
|
115
|
+
def load_data(path: str, split: str):
|
|
116
|
+
# Load your data...
|
|
117
|
+
return {"data": [...], "metadata": {...}}
|
|
118
|
+
|
|
119
|
+
@model()
|
|
120
|
+
def train_model(train_data, architecture: str, lr: float):
|
|
121
|
+
# Non-basic args (train_data) become dependencies
|
|
122
|
+
# Basic args (architecture, lr) become config
|
|
123
|
+
return trained_model
|
|
124
|
+
|
|
125
|
+
@evaluation()
|
|
126
|
+
def evaluate(model_obj, metric: str):
|
|
127
|
+
return {"score": 0.95}
|
|
128
|
+
|
|
129
|
+
# Execute your pipeline
|
|
130
|
+
data = load_data(path="/data/train.csv", split="train")
|
|
131
|
+
model = train_model(train_data=data, architecture="bert", lr=0.001)
|
|
132
|
+
results = evaluate(model_obj=model, metric="accuracy")
|
|
133
|
+
|
|
134
|
+
# Print the DAG
|
|
135
|
+
dag = build_dag()
|
|
136
|
+
for obj_id, obj in dag.items():
|
|
137
|
+
print(f"{obj['type']}: {obj['config']}, depends on: {obj['dependencies']}")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Rebuild the traced object
|
|
141
|
+
The traced objects are not pickled; instead, the arguments the functions are called with are saved.
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
import research_pipelines.query as query
|
|
145
|
+
|
|
146
|
+
# we can now easily call the functions with the recorded arguments via build()
|
|
147
|
+
dataset = query.build(
|
|
148
|
+
load_data
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
# or just get the arguments such that we can call it ourselves
|
|
152
|
+
model_obj, metric = query.build_arguments(
|
|
153
|
+
evaluate
|
|
154
|
+
)
|
|
155
|
+
model_obj.load_state_dict(state_dict)
|
|
156
|
+
evaluate(model_obj, metric)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## How It Works
|
|
160
|
+
|
|
161
|
+
### 1. Decoration
|
|
162
|
+
|
|
163
|
+
Apply `@dataset()`, `@model()`, `@evaluation()`, or generic `@traced(traced_type="...")` to your functions or class constructors:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
@dataset()
|
|
167
|
+
def load_data(path: str, split: str):
|
|
168
|
+
return load_from_disk(path)
|
|
169
|
+
|
|
170
|
+
@model()
|
|
171
|
+
class MyModel:
|
|
172
|
+
def __init__(self, layers: int, dataset_input):
|
|
173
|
+
self.layers = layers
|
|
174
|
+
self.data = dataset_input
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### 2. Automatic Tracing
|
|
178
|
+
|
|
179
|
+
When you call a decorated function/constructor:
|
|
180
|
+
- **Arguments are classified**:
|
|
181
|
+
- **Basic types** (str, int, float, bool, None): stored as configuration
|
|
182
|
+
- **Traced objects** (returned from other @traced functions): become dependencies
|
|
183
|
+
- **Other types**: ignored (can be supplied manually later)
|
|
184
|
+
- **Unique ID** is generated for this object
|
|
185
|
+
- **Configuration** (basic args + type) is persisted to backend
|
|
186
|
+
- **Dependencies** (other traced object IDs) are recorded
|
|
187
|
+
|
|
188
|
+
### 3. DAG Structure
|
|
189
|
+
|
|
190
|
+
The framework automatically builds a DAG:
|
|
191
|
+
```
|
|
192
|
+
dataset_1 (config: path="/data/train.csv", split="train")
|
|
193
|
+
↓
|
|
194
|
+
model_1 (config: architecture="bert", lr=0.001, depends_on: [dataset_1])
|
|
195
|
+
↓
|
|
196
|
+
eval_1 (config: metric="accuracy", depends_on: [model_1])
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### 4. Backend Persistence
|
|
200
|
+
|
|
201
|
+
Choose a backend to persist configurations:
|
|
202
|
+
|
|
203
|
+
**PickleBackend** (default for testing):
|
|
204
|
+
```python
|
|
205
|
+
from research_pipelines.backends.pickle_backend import PickleBackend
|
|
206
|
+
from research_pipelines.backends.manager import set_backend
|
|
207
|
+
|
|
208
|
+
backend = PickleBackend(directory=".traced_configs")
|
|
209
|
+
set_backend(backend)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**WandBBackend** (for wandb integration):
|
|
213
|
+
```python
|
|
214
|
+
import wandb
|
|
215
|
+
from research_pipelines.backends.wandb_backend import WandBBackend
|
|
216
|
+
from research_pipelines.backends.manager import set_backend
|
|
217
|
+
|
|
218
|
+
wandb.init(project="my_project")
|
|
219
|
+
backend = WandBBackend()
|
|
220
|
+
set_backend(backend)
|
|
221
|
+
|
|
222
|
+
# Configs are automatically logged to wandb.run.config
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## API Reference
|
|
226
|
+
|
|
227
|
+
### Decorators
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from research_pipelines.decorators import dataset, model, evaluation, traced
|
|
231
|
+
|
|
232
|
+
@dataset()
|
|
233
|
+
def load_data(...):
|
|
234
|
+
"""Traces a dataset creation function/class."""
|
|
235
|
+
pass
|
|
236
|
+
|
|
237
|
+
@model()
|
|
238
|
+
def train(...):
|
|
239
|
+
"""Traces a model creation function/class."""
|
|
240
|
+
pass
|
|
241
|
+
|
|
242
|
+
@evaluation()
|
|
243
|
+
def eval(...):
|
|
244
|
+
"""Traces an evaluation function/class."""
|
|
245
|
+
pass
|
|
246
|
+
|
|
247
|
+
@traced(traced_type="custom")
|
|
248
|
+
def my_function(...):
|
|
249
|
+
"""Generic tracer with custom type."""
|
|
250
|
+
pass
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### DAG Operations
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
from research_pipelines.dag import (
|
|
257
|
+
build_dag,
|
|
258
|
+
get_dependencies_recursive,
|
|
259
|
+
detect_circular_dependencies,
|
|
260
|
+
export_dag,
|
|
261
|
+
get_root_objects,
|
|
262
|
+
get_leaf_objects,
|
|
263
|
+
get_objects_by_type,
|
|
264
|
+
get_dependents,
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
# Build full DAG
|
|
268
|
+
dag = build_dag()
|
|
269
|
+
|
|
270
|
+
# Get all transitive dependencies
|
|
271
|
+
deps = get_dependencies_recursive(object_id)
|
|
272
|
+
|
|
273
|
+
# Check for cycles
|
|
274
|
+
has_cycles = detect_circular_dependencies()
|
|
275
|
+
|
|
276
|
+
# Export for serialization
|
|
277
|
+
dag_export = export_dag()
|
|
278
|
+
|
|
279
|
+
# Find roots (datasets with no dependencies)
|
|
280
|
+
roots = get_root_objects()
|
|
281
|
+
|
|
282
|
+
# Find leaves (objects nothing depends on)
|
|
283
|
+
leaves = get_leaf_objects()
|
|
284
|
+
|
|
285
|
+
# Filter by type
|
|
286
|
+
datasets = get_objects_by_type("dataset")
|
|
287
|
+
models = get_objects_by_type("model")
|
|
288
|
+
|
|
289
|
+
# Find what depends on an object
|
|
290
|
+
dependents = get_dependents(object_id)
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### Backends
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
from research_pipelines.backends.manager import get_backend, set_backend
|
|
297
|
+
|
|
298
|
+
# Get active backend
|
|
299
|
+
backend = get_backend()
|
|
300
|
+
|
|
301
|
+
# Set custom backend
|
|
302
|
+
set_backend(my_backend)
|
|
303
|
+
|
|
304
|
+
# Backend interface
|
|
305
|
+
class Backend(ABC):
|
|
306
|
+
def log_config(object_id, config_dict, dependencies):
|
|
307
|
+
"""Persist config for an object."""
|
|
308
|
+
pass
|
|
309
|
+
|
|
310
|
+
def get_config(object_id):
|
|
311
|
+
"""Retrieve config for an object."""
|
|
312
|
+
pass
|
|
313
|
+
|
|
314
|
+
def load_all():
|
|
315
|
+
"""Load all configs."""
|
|
316
|
+
pass
|
|
317
|
+
|
|
318
|
+
def clear():
|
|
319
|
+
"""Clear all configs."""
|
|
320
|
+
pass
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## Configuration Format
|
|
324
|
+
|
|
325
|
+
Configurations are stored as dictionaries with the following structure:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
{
|
|
329
|
+
"object_id_1": {
|
|
330
|
+
"callable": "examples.simple_pipeline:load_dataset"
|
|
331
|
+
"config": {
|
|
332
|
+
"path": "/data/train.csv",
|
|
333
|
+
"split": "train",
|
|
334
|
+
"batch_size": 32,
|
|
335
|
+
},
|
|
336
|
+
"dependencies": [],
|
|
337
|
+
},
|
|
338
|
+
"object_id_2": {
|
|
339
|
+
"callable": "examples.simple_pipeline:create_model"
|
|
340
|
+
"config": {
|
|
341
|
+
"architecture": "bert",
|
|
342
|
+
"learning_rate": 0.001,
|
|
343
|
+
},
|
|
344
|
+
"dependencies": ["object_id_1"],
|
|
345
|
+
},
|
|
346
|
+
}
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
When using WandBBackend, this is stored directly in `wandb.run.config`.
|
|
350
|
+
|
|
351
|
+
## Examples
|
|
352
|
+
|
|
353
|
+
See [examples/simple_pipeline.py](examples/simple_pipeline.py) for a complete end-to-end example.
|
|
354
|
+
|
|
355
|
+
Run it:
|
|
356
|
+
```bash
|
|
357
|
+
conda activate research_pipelines
|
|
358
|
+
python examples/simple_pipeline.py
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
## Testing
|
|
362
|
+
|
|
363
|
+
All tests use PickleBackend and are fully isolated:
|
|
364
|
+
|
|
365
|
+
```bash
|
|
366
|
+
conda activate research_pipelines
|
|
367
|
+
pytest tests/ -v
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
## Development
|
|
371
|
+
|
|
372
|
+
The framework is organized into modules:
|
|
373
|
+
|
|
374
|
+
- `src/research_pipelines/core.py` - Core tracing logic
|
|
375
|
+
- `src/research_pipelines/decorators.py` - @dataset, @model, @evaluation decorators
|
|
376
|
+
- `src/research_pipelines/backends/` - Backend implementations
|
|
377
|
+
- `base.py` - Abstract Backend interface
|
|
378
|
+
- `pickle_backend.py` - PickleBackend (testing)
|
|
379
|
+
- `wandb_backend.py` - WandBBackend (wandb integration)
|
|
380
|
+
- `manager.py` - Global backend management
|
|
381
|
+
- `src/research_pipelines/dag.py` - DAG utilities
|
|
382
|
+
- `tests/` - Test suite (61 tests, all passing)
|
|
383
|
+
|
|
384
|
+
## Key Design Decisions
|
|
385
|
+
|
|
386
|
+
1. **Lazy Imports**: wandb is only imported when WandBBackend is used
|
|
387
|
+
2. **Automatic Dependency Detection**: Uses Python's `id()` to track object identity
|
|
388
|
+
3. **In-Memory Registry**: Separate from backend storage, enables DAG operations
|
|
389
|
+
4. **UUID v4 IDs**: Unique, collision-free object identifiers
|
|
390
|
+
5. **Type-Based Filtering**: Basic types automatically detected and persisted
|
|
391
|
+
6. **Pluggable Backends**: Easy to add custom storage implementations
|
|
392
|
+
|
|
393
|
+
## Limitations & Future Work
|
|
394
|
+
|
|
395
|
+
- No support for custom object serialization (by design)
|
|
396
|
+
- No execution timing/profiling (configuration-only tracking)
|
|
397
|
+
- No automatic versioning/hashing of objects
|
|
398
|
+
|
|
399
|
+
## License
|
|
400
|
+
|
|
401
|
+
MIT
|