taskcore-lib 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- taskcore_lib-0.0.2/LICENSE +21 -0
- taskcore_lib-0.0.2/PKG-INFO +244 -0
- taskcore_lib-0.0.2/README.md +221 -0
- taskcore_lib-0.0.2/setup.cfg +4 -0
- taskcore_lib-0.0.2/setup.py +26 -0
- taskcore_lib-0.0.2/taskcore/__init__.py +1 -0
- taskcore_lib-0.0.2/taskcore/base.py +233 -0
- taskcore_lib-0.0.2/taskcore/cli.py +39 -0
- taskcore_lib-0.0.2/taskcore/dist.py +53 -0
- taskcore_lib-0.0.2/taskcore_lib.egg-info/PKG-INFO +244 -0
- taskcore_lib-0.0.2/taskcore_lib.egg-info/SOURCES.txt +11 -0
- taskcore_lib-0.0.2/taskcore_lib.egg-info/dependency_links.txt +1 -0
- taskcore_lib-0.0.2/taskcore_lib.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Gonçalo Faria
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: taskcore-lib
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Package for basic task queue in the filesystem.
|
|
5
|
+
Home-page: https://github.com/goncalorafaria/taskcore
|
|
6
|
+
Author: Goncalo Faria
|
|
7
|
+
Author-email: gfaria@cs.washington.edu
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Dynamic: author
|
|
15
|
+
Dynamic: author-email
|
|
16
|
+
Dynamic: classifier
|
|
17
|
+
Dynamic: description
|
|
18
|
+
Dynamic: description-content-type
|
|
19
|
+
Dynamic: home-page
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
Dynamic: summary
|
|
23
|
+
|
|
24
|
+
# TaskCore
|
|
25
|
+
|
|
26
|
+
A lightweight, filesystem-based task queue system for Python. TaskCore provides a simple way to distribute and manage tasks across multiple processes or machines using the filesystem as the backend.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **Simple Setup**: No external dependencies - uses only Python standard library
|
|
31
|
+
- **Filesystem-based**: Tasks are stored as JSON files on disk
|
|
32
|
+
- **Multi-process Support**: Built-in support for distributed task processing
|
|
33
|
+
- **Fault Tolerant**: Automatic task recovery and stale task reclamation
|
|
34
|
+
- **Atomic Operations**: Safe concurrent access using atomic file operations
|
|
35
|
+
- **SLURM Integration**: Works seamlessly with SLURM job schedulers
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install taskcore-lib
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or install from source:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/goncalorafaria/taskcore
|
|
47
|
+
cd taskcore
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
### Basic Usage
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from taskcore import FileSystemTaskQueueClient
|
|
57
|
+
|
|
58
|
+
# Initialize the task queue
|
|
59
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
60
|
+
|
|
61
|
+
# Add tasks to the queue
|
|
62
|
+
task_config = {
|
|
63
|
+
"learning_rate": 1e-4,
|
|
64
|
+
"batch_size": 32,
|
|
65
|
+
"epochs": 100,
|
|
66
|
+
"model_name": "bert-base-uncased"
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
task_id = queue.add_task(task_config)
|
|
70
|
+
print(f"Added task with ID: {task_id}")
|
|
71
|
+
|
|
72
|
+
# Process tasks
|
|
73
|
+
def init_function(**config):
|
|
74
|
+
# Initialize your model/trainer here
|
|
75
|
+
trainer = create_trainer(**config)
|
|
76
|
+
return trainer, {"status": "initialized"}
|
|
77
|
+
|
|
78
|
+
def run_function(trainer):
|
|
79
|
+
# Run your training/processing here
|
|
80
|
+
trainer.train()
|
|
81
|
+
|
|
82
|
+
# Fetch and run tasks
|
|
83
|
+
while queue.fetch_and_run_task(init_func=init_function, func=run_function):
|
|
84
|
+
pass
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Multi-Process Example
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import os
|
|
91
|
+
from taskcore import FileSystemTaskQueueClient
|
|
92
|
+
|
|
93
|
+
def main():
|
|
94
|
+
# Use LOCAL_RANK for multi-process setups (e.g., with torch.distributed)
|
|
95
|
+
rank = int(os.environ.get("LOCAL_RANK", 0))
|
|
96
|
+
queue = FileSystemTaskQueueClient("/path/to/tasks", rank=rank)
|
|
97
|
+
|
|
98
|
+
def init_func(**config):
|
|
99
|
+
# Initialize your distributed training setup
|
|
100
|
+
trainer = create_distributed_trainer(**config)
|
|
101
|
+
return trainer, {"wandb_link": trainer.wandb_link}
|
|
102
|
+
|
|
103
|
+
def run_func(trainer):
|
|
104
|
+
trainer.train()
|
|
105
|
+
|
|
106
|
+
# Process tasks until none are available
|
|
107
|
+
while queue.fetch_and_run_task(init_func=init_func, func=run_func):
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
if __name__ == "__main__":
|
|
111
|
+
main()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Task Generation Example
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from taskcore import FileSystemTaskQueueClient
|
|
118
|
+
|
|
119
|
+
# Initialize queue
|
|
120
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
121
|
+
|
|
122
|
+
# Define hyperparameter sweep
|
|
123
|
+
learning_rates = [1e-4, 1e-5, 1e-6]
|
|
124
|
+
batch_sizes = [16, 32, 64]
|
|
125
|
+
model_names = ["bert-base-uncased", "roberta-base"]
|
|
126
|
+
|
|
127
|
+
# Generate all combinations
|
|
128
|
+
for lr in learning_rates:
|
|
129
|
+
for batch_size in batch_sizes:
|
|
130
|
+
for model_name in model_names:
|
|
131
|
+
config = {
|
|
132
|
+
"learning_rate": lr,
|
|
133
|
+
"batch_size": batch_size,
|
|
134
|
+
"model_name": model_name,
|
|
135
|
+
"epochs": 100,
|
|
136
|
+
"wandb_project": "my-experiment"
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
task_id = queue.add_task(config)
|
|
140
|
+
print(f"Added task {task_id}: {config}")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## API Reference
|
|
144
|
+
|
|
145
|
+
### FileSystemTaskQueueClient
|
|
146
|
+
|
|
147
|
+
The main client class for interacting with the task queue.
|
|
148
|
+
|
|
149
|
+
#### Constructor
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
FileSystemTaskQueueClient(base_dir: str, rank: int = 0)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
- `base_dir`: Directory where task files are stored
|
|
156
|
+
- `rank`: Process rank for multi-process setups (default: 0)
|
|
157
|
+
|
|
158
|
+
#### Methods
|
|
159
|
+
|
|
160
|
+
##### `add_task(task_dict: Dict) -> str`
|
|
161
|
+
Add a new task to the queue.
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
task_id = queue.add_task({"param": "value"})
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
##### `fetch_and_run_task(init_func: Callable, func: Callable) -> bool`
|
|
168
|
+
Fetch a task, initialize it with `init_func`, and run it with `func`.
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
def init_func(**config):
|
|
172
|
+
return trainer, extra_info
|
|
173
|
+
|
|
174
|
+
def run_func(trainer):
|
|
175
|
+
trainer.train()
|
|
176
|
+
|
|
177
|
+
success = queue.fetch_and_run_task(init_func, run_func)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
##### `fetch_task() -> Tuple[str, Dict]`
|
|
181
|
+
Fetch the next available task.
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
task_id, config = queue.fetch_task()
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
##### `finish_current_task()`
|
|
188
|
+
Mark the current task as completed.
|
|
189
|
+
|
|
190
|
+
##### `release_current_task()`
|
|
191
|
+
Release the current task back to the pending queue.
|
|
192
|
+
|
|
193
|
+
##### `edit_current_task(new_dict: Dict)`
|
|
194
|
+
Update the current task's configuration.
|
|
195
|
+
|
|
196
|
+
##### `get_current_task() -> Dict`
|
|
197
|
+
Get the configuration of the current task.
|
|
198
|
+
|
|
199
|
+
##### `has_task() -> bool`
|
|
200
|
+
Check if a task is currently being processed.
|
|
201
|
+
|
|
202
|
+
## Directory Structure
|
|
203
|
+
|
|
204
|
+
TaskCore creates the following directory structure:
|
|
205
|
+
|
|
206
|
+
```
|
|
207
|
+
base_dir/
|
|
208
|
+
├── pending/ # Tasks waiting to be processed
|
|
209
|
+
├── running/ # Tasks currently being processed
|
|
210
|
+
└── finished/ # Completed tasks
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Environment Variables
|
|
214
|
+
|
|
215
|
+
TaskCore uses these environment variables for distributed processing:
|
|
216
|
+
|
|
217
|
+
- `SLURM_JOB_ID`: Job ID from SLURM scheduler
|
|
218
|
+
- `MY_JOB_ID`: Fallback job ID if SLURM_JOB_ID is not available
|
|
219
|
+
- `LOCAL_RANK`: Process rank in multi-process setups
|
|
220
|
+
|
|
221
|
+
## Error Handling
|
|
222
|
+
|
|
223
|
+
Tasks that fail during execution are automatically released back to the pending queue:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
try:
|
|
227
|
+
queue.fetch_and_run_task(init_func, run_func)
|
|
228
|
+
except Exception as e:
|
|
229
|
+
print(f"Task failed: {e}")
|
|
230
|
+
# Task is automatically released back to pending queue
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Stale Task Recovery
|
|
234
|
+
|
|
235
|
+
TaskCore automatically reclaims stale tasks (default: 4 hours timeout):
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
# Custom timeout in seconds
|
|
239
|
+
queue.queue.reclaim_stale_tasks(timeout_seconds=3600) # 1 hour
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# TaskCore
|
|
2
|
+
|
|
3
|
+
A lightweight, filesystem-based task queue system for Python. TaskCore provides a simple way to distribute and manage tasks across multiple processes or machines using the filesystem as the backend.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Simple Setup**: No external dependencies - uses only Python standard library
|
|
8
|
+
- **Filesystem-based**: Tasks are stored as JSON files on disk
|
|
9
|
+
- **Multi-process Support**: Built-in support for distributed task processing
|
|
10
|
+
- **Fault Tolerant**: Automatic task recovery and stale task reclamation
|
|
11
|
+
- **Atomic Operations**: Safe concurrent access using atomic file operations
|
|
12
|
+
- **SLURM Integration**: Works seamlessly with SLURM job schedulers
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install taskcore-lib
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Or install from source:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/goncalorafaria/taskcore
|
|
24
|
+
cd taskcore
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### Basic Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from taskcore import FileSystemTaskQueueClient
|
|
34
|
+
|
|
35
|
+
# Initialize the task queue
|
|
36
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
37
|
+
|
|
38
|
+
# Add tasks to the queue
|
|
39
|
+
task_config = {
|
|
40
|
+
"learning_rate": 1e-4,
|
|
41
|
+
"batch_size": 32,
|
|
42
|
+
"epochs": 100,
|
|
43
|
+
"model_name": "bert-base-uncased"
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
task_id = queue.add_task(task_config)
|
|
47
|
+
print(f"Added task with ID: {task_id}")
|
|
48
|
+
|
|
49
|
+
# Process tasks
|
|
50
|
+
def init_function(**config):
|
|
51
|
+
# Initialize your model/trainer here
|
|
52
|
+
trainer = create_trainer(**config)
|
|
53
|
+
return trainer, {"status": "initialized"}
|
|
54
|
+
|
|
55
|
+
def run_function(trainer):
|
|
56
|
+
# Run your training/processing here
|
|
57
|
+
trainer.train()
|
|
58
|
+
|
|
59
|
+
# Fetch and run tasks
|
|
60
|
+
while queue.fetch_and_run_task(init_func=init_function, func=run_function):
|
|
61
|
+
pass
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Multi-Process Example
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import os
|
|
68
|
+
from taskcore import FileSystemTaskQueueClient
|
|
69
|
+
|
|
70
|
+
def main():
|
|
71
|
+
# Use LOCAL_RANK for multi-process setups (e.g., with torch.distributed)
|
|
72
|
+
rank = int(os.environ.get("LOCAL_RANK", 0))
|
|
73
|
+
queue = FileSystemTaskQueueClient("/path/to/tasks", rank=rank)
|
|
74
|
+
|
|
75
|
+
def init_func(**config):
|
|
76
|
+
# Initialize your distributed training setup
|
|
77
|
+
trainer = create_distributed_trainer(**config)
|
|
78
|
+
return trainer, {"wandb_link": trainer.wandb_link}
|
|
79
|
+
|
|
80
|
+
def run_func(trainer):
|
|
81
|
+
trainer.train()
|
|
82
|
+
|
|
83
|
+
# Process tasks until none are available
|
|
84
|
+
while queue.fetch_and_run_task(init_func=init_func, func=run_func):
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
if __name__ == "__main__":
|
|
88
|
+
main()
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Task Generation Example
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from taskcore import FileSystemTaskQueueClient
|
|
95
|
+
|
|
96
|
+
# Initialize queue
|
|
97
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
98
|
+
|
|
99
|
+
# Define hyperparameter sweep
|
|
100
|
+
learning_rates = [1e-4, 1e-5, 1e-6]
|
|
101
|
+
batch_sizes = [16, 32, 64]
|
|
102
|
+
model_names = ["bert-base-uncased", "roberta-base"]
|
|
103
|
+
|
|
104
|
+
# Generate all combinations
|
|
105
|
+
for lr in learning_rates:
|
|
106
|
+
for batch_size in batch_sizes:
|
|
107
|
+
for model_name in model_names:
|
|
108
|
+
config = {
|
|
109
|
+
"learning_rate": lr,
|
|
110
|
+
"batch_size": batch_size,
|
|
111
|
+
"model_name": model_name,
|
|
112
|
+
"epochs": 100,
|
|
113
|
+
"wandb_project": "my-experiment"
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
task_id = queue.add_task(config)
|
|
117
|
+
print(f"Added task {task_id}: {config}")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## API Reference
|
|
121
|
+
|
|
122
|
+
### FileSystemTaskQueueClient
|
|
123
|
+
|
|
124
|
+
The main client class for interacting with the task queue.
|
|
125
|
+
|
|
126
|
+
#### Constructor
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
FileSystemTaskQueueClient(base_dir: str, rank: int = 0)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
- `base_dir`: Directory where task files are stored
|
|
133
|
+
- `rank`: Process rank for multi-process setups (default: 0)
|
|
134
|
+
|
|
135
|
+
#### Methods
|
|
136
|
+
|
|
137
|
+
##### `add_task(task_dict: Dict) -> str`
|
|
138
|
+
Add a new task to the queue.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
task_id = queue.add_task({"param": "value"})
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
##### `fetch_and_run_task(init_func: Callable, func: Callable) -> bool`
|
|
145
|
+
Fetch a task, initialize it with `init_func`, and run it with `func`.
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
def init_func(**config):
|
|
149
|
+
return trainer, extra_info
|
|
150
|
+
|
|
151
|
+
def run_func(trainer):
|
|
152
|
+
trainer.train()
|
|
153
|
+
|
|
154
|
+
success = queue.fetch_and_run_task(init_func, run_func)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
##### `fetch_task() -> Tuple[str, Dict]`
|
|
158
|
+
Fetch the next available task.
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
task_id, config = queue.fetch_task()
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
##### `finish_current_task()`
|
|
165
|
+
Mark the current task as completed.
|
|
166
|
+
|
|
167
|
+
##### `release_current_task()`
|
|
168
|
+
Release the current task back to the pending queue.
|
|
169
|
+
|
|
170
|
+
##### `edit_current_task(new_dict: Dict)`
|
|
171
|
+
Update the current task's configuration.
|
|
172
|
+
|
|
173
|
+
##### `get_current_task() -> Dict`
|
|
174
|
+
Get the configuration of the current task.
|
|
175
|
+
|
|
176
|
+
##### `has_task() -> bool`
|
|
177
|
+
Check if a task is currently being processed.
|
|
178
|
+
|
|
179
|
+
## Directory Structure
|
|
180
|
+
|
|
181
|
+
TaskCore creates the following directory structure:
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
base_dir/
|
|
185
|
+
├── pending/ # Tasks waiting to be processed
|
|
186
|
+
├── running/ # Tasks currently being processed
|
|
187
|
+
└── finished/ # Completed tasks
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Environment Variables
|
|
191
|
+
|
|
192
|
+
TaskCore uses these environment variables for distributed processing:
|
|
193
|
+
|
|
194
|
+
- `SLURM_JOB_ID`: Job ID from SLURM scheduler
|
|
195
|
+
- `MY_JOB_ID`: Fallback job ID if SLURM_JOB_ID is not available
|
|
196
|
+
- `LOCAL_RANK`: Process rank in multi-process setups
|
|
197
|
+
|
|
198
|
+
## Error Handling
|
|
199
|
+
|
|
200
|
+
Tasks that fail during execution are automatically released back to the pending queue:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
try:
|
|
204
|
+
queue.fetch_and_run_task(init_func, run_func)
|
|
205
|
+
except Exception as e:
|
|
206
|
+
print(f"Task failed: {e}")
|
|
207
|
+
# Task is automatically released back to pending queue
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Stale Task Recovery
|
|
211
|
+
|
|
212
|
+
TaskCore automatically reclaims stale tasks (default: 4 hours timeout):
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
# Custom timeout in seconds
|
|
216
|
+
queue.queue.reclaim_stale_tasks(timeout_seconds=3600) # 1 hour
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import setuptools
|
|
2
|
+
|
|
3
|
+
with open("README.md", "r") as fh:
|
|
4
|
+
long_description = fh.read()
|
|
5
|
+
|
|
6
|
+
with open("requirements.txt", "r") as fr:
|
|
7
|
+
installation_requirements = fr.readlines()
|
|
8
|
+
|
|
9
|
+
setuptools.setup(
|
|
10
|
+
name="taskcore-lib",
|
|
11
|
+
version="0.0.2",
|
|
12
|
+
author="Goncalo Faria",
|
|
13
|
+
author_email="gfaria@cs.washington.edu",
|
|
14
|
+
description="Package for basic task queue in the filesystem.",
|
|
15
|
+
long_description=long_description,
|
|
16
|
+
long_description_content_type="text/markdown",
|
|
17
|
+
url="https://github.com/goncalorafaria/taskcore",
|
|
18
|
+
packages=setuptools.find_packages(),
|
|
19
|
+
install_requires=installation_requirements,
|
|
20
|
+
python_requires=">=3.6.0",
|
|
21
|
+
classifiers=[
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"License :: OSI Approved :: MIT License",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
],
|
|
26
|
+
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .base import FileSystemTaskQueueClient
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import os
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import json
|
|
11
|
+
import uuid
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional, Dict
|
|
15
|
+
from taskcore.dist import atomic_write, get_unique_job_id, wait_for_tid_file, get_tid_file_path
|
|
16
|
+
import traceback
|
|
17
|
+
from typing import Callable
|
|
18
|
+
|
|
19
|
+
class FileSystemTaskQueue:
    """JSON-file task queue rooted at ``base_dir``.

    Every task is one JSON file that moves between three sibling
    directories as its state changes: ``pending`` -> ``running`` ->
    ``finished``.  Transitions use ``os.rename``, which is atomic on
    POSIX filesystems, so several workers may share a single queue.
    """

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.pending_dir = self.base_dir / "pending"
        self.running_dir = self.base_dir / "running"
        self.finished_dir = self.base_dir / "finished"
        # Lay out the queue directories up front; harmless if they exist.
        for directory in (self.pending_dir, self.running_dir, self.finished_dir):
            directory.mkdir(parents=True, exist_ok=True)

    def add_task(self, task_dict: Dict):
        """Store ``task_dict`` as a new pending task and return its id."""
        new_id = str(uuid.uuid4())
        destination = self.pending_dir / f"{new_id}.json"
        with open(destination, "w") as handle:
            json.dump(task_dict, handle)
        return new_id

    def num_pending_tasks(self) -> int:
        """Number of tasks waiting to be picked up."""
        return sum(1 for _ in self.pending_dir.iterdir())

    def num_running_tasks(self) -> int:
        """Number of tasks currently claimed by workers."""
        return sum(1 for _ in self.running_dir.iterdir())

    def num_finished_tasks(self) -> int:
        """Number of completed tasks."""
        return sum(1 for _ in self.finished_dir.iterdir())

    def read_task(self, task_id: str) -> Dict:
        """Load the JSON payload of a *running* task by its id."""
        with open(self.running_dir / f"{task_id}.json", "r") as handle:
            return json.load(handle)

    def fetch_task(self) -> Optional[str]:
        """Claim one pending task; return its running-file path, or None."""
        for candidate in self.pending_dir.iterdir():
            claimed = self.running_dir / candidate.name
            try:
                # Atomic rename: exactly one worker wins any race.
                os.rename(candidate, claimed)
            except FileNotFoundError:
                continue  # another worker claimed it first
            return str(claimed)
        return None

    def finish_task(self, running_file: str):
        """Move a claimed task from running/ to finished/."""
        source = Path(running_file)
        os.rename(source, self.finished_dir / source.name)

    def reset_task_timer(self, running_file: str):
        """Touch the running file so it is not considered stale."""
        now = time.time()
        os.utime(running_file, (now, now))

    def reclaim_stale_tasks(self, timeout_seconds: int = 600):
        """Return running tasks untouched for ``timeout_seconds`` to pending/."""
        now = time.time()
        for stale in self.running_dir.iterdir():
            if now - stale.stat().st_mtime <= timeout_seconds:
                continue
            try:
                os.rename(stale, self.pending_dir / stale.name)
            except FileNotFoundError:
                continue  # another process already reclaimed it

    def get_task_dict(self, task_file: str) -> Dict:
        """Load the JSON payload stored at ``task_file``."""
        with open(task_file, "r") as handle:
            return json.load(handle)

    def update_task_info(
        self,
        task_file: str,
        new_dict: Optional[Dict] = None,
    ):
        """
        Overwrite the entire task file with new_dict if provided.
        Does nothing if new_dict is None.
        """
        if new_dict is None:
            return
        with open(Path(task_file), "w") as handle:
            json.dump(new_dict, handle)
|
97
|
+
|
|
98
|
+
class TaskQueueClient:
    """Stateful client that holds at most one "current" task.

    Wraps a FileSystemTaskQueue and tracks the file of the task this
    process is working on; fetch/finish/release enforce the
    one-task-at-a-time invariant.

    Args:
        queue: The underlying filesystem queue.
        timeout: Seconds after which a running task counts as stale and
            is reclaimed before each fetch (default: 4 hours).
    """

    def __init__(self, queue: "FileSystemTaskQueue", timeout: int = 60 * 60 * 4):
        self.queue = queue
        self.current_task_file = None
        # Bug fix: the `timeout` argument was previously ignored and the
        # value hard-coded to 60*60*4; honor the caller's choice.
        self.timeout = timeout

    def num_pending_tasks(self) -> int:
        """Number of tasks waiting in the queue."""
        return self.queue.num_pending_tasks()

    def num_running_tasks(self) -> int:
        """Number of tasks currently claimed by workers."""
        return self.queue.num_running_tasks()

    def num_finished_tasks(self) -> int:
        """Number of completed tasks."""
        return self.queue.num_finished_tasks()

    def read_task(self, task_file: str):
        """Load a running task's payload via the underlying queue.

        NOTE(review): despite the parameter name, the underlying
        ``FileSystemTaskQueue.read_task`` expects a task *id*, not a
        file path — callers pass the id here.
        """
        return self.queue.read_task(task_file)

    def fetch_task(self):
        """Claim the next pending task.

        Reclaims stale tasks first, then atomically claims one pending
        task. Returns ``(task_file, task_dict)`` or None if the queue is
        empty.

        Raises:
            RuntimeError: If a task is already fetched and not yet
                finished or released.
        """
        if self.current_task_file is not None:
            raise RuntimeError("A task is already fetched. Finish or release it before fetching another.")
        self.queue.reclaim_stale_tasks(self.timeout)  # Reclaim stale tasks before fetching
        task_file = self.queue.fetch_task()
        if task_file:
            self.current_task_file = task_file
            return self.current_task_file, self.get_current_task()
        return None

    def edit_current_task(self, new_dict):
        """Overwrite the current task's stored configuration."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        self.queue.update_task_info(self.current_task_file, new_dict)

    def finish_current_task(self):
        """Mark the current task as completed and clear the local state."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        self.queue.finish_task(self.current_task_file)
        self.current_task_file = None

    def release_current_task(self):
        """Return the current task to the pending queue (e.g. on failure)."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        # Move the task back to pending
        task_path = Path(self.current_task_file)
        pending_file = self.queue.pending_dir / task_path.name
        os.rename(task_path, pending_file)
        self.current_task_file = None

    def get_current_task(self):
        """Return the current task's configuration dict, or None."""
        if self.current_task_file is None:
            return None
        return self.queue.get_task_dict(self.current_task_file)

    def has_task(self):
        """True while a fetched task has not been finished or released."""
        return self.current_task_file is not None
|
155
|
+
|
|
156
|
+
def dummy_run_func(*args, **kwargs):
    """No-op run function; default `func` for fetch_and_run_task."""
    return None
|
158
|
+
|
|
159
|
+
class FileSystemTaskQueueClient(TaskQueueClient):
    """Multi-process task queue client.

    Rank 0 claims tasks from the filesystem queue and announces the
    claimed task id to sibling processes by writing a rendezvous file
    under /tmp (see taskcore.dist); other ranks wait for that file and
    load the same task.  Only rank 0 mutates queue state.
    """

    def __init__(self,base_dir: str, rank: int = 0, timeout: int = 60*60*4):
        # Owns its queue instance; `rank` distinguishes the coordinating
        # process (0) from followers.
        self.queue = FileSystemTaskQueue(base_dir)
        super().__init__(self.queue, timeout)
        self.rank = rank

    def add_task(self, task_dict: Dict):
        """Enqueue a task; returns the new task id."""
        return self.queue.add_task(task_dict)

    def fetch_task(self):
        """Fetch one task and distribute it to all ranks.

        Returns ``(tid, config)``.  NOTE(review): on rank 0 `tid` is the
        running-file *path* returned by the parent class, while on other
        ranks it is the bare task id parsed from the rendezvous file
        name — confirm downstream code tolerates both.

        Raises:
            RuntimeError: On rank 0 when the queue is empty (it does not
                return None, unlike TaskQueueClient.fetch_task).
        """
        job_id = get_unique_job_id()

        if self.rank == 0:
            # Main process fetches the next task
            result =super().fetch_task() ## fetch the task on main process
            if not result:
                print("No task found")
                raise RuntimeError("No task found")

            print(f"Fetched task {result}")
            tid, config = result

            shared_spot_name = get_tid_file_path(tid) ## get path for the shared spot
            atomic_write(tid, shared_spot_name)## write to a shared spot to tell other processes to fetch the task

            return tid, config
        else:
            # Followers give rank 0 a head start before polling /tmp.
            time.sleep(15)
            tid = wait_for_tid_file(job_id) ## wait for new file in shared spot.

            # Rendezvous files are named "{job_id}_{tid}.json"; recover the id.
            tid = tid.split("_")[-1].split(".")[0] ## get the tid from the file name

            config = self.read_task(tid) ## broadcast the task to all processes

            return tid, config

    def fetch_and_run_task(self, init_func: Callable, func: Callable = dummy_run_func):
        """Fetch a task, build state with ``init_func``, run ``func``.

        ``init_func(**config)`` must return ``(trainer, extra_info)``;
        on rank 0 ``extra_info`` plus the task id are written back into
        the task file before ``func(trainer)`` runs.  Returns True on
        success; on failure prints the traceback, releases the task
        (rank 0 only) and returns False.  The /tmp rendezvous file is
        removed in all cases.
        """

        tid, config = self.fetch_task() ## takes care of distributing a single task to all processes

        print(f"Running task {config}")

        try:
            trainer, extra_info = init_func(**config)

            if self.rank == 0:
                # Record runtime metadata (e.g. dashboard links) in the task file.
                config.update(extra_info)
                #config["wandb_link"] = trainer.wandb_link
                config["task_id"] = tid
                self.edit_current_task(config)

            func(
                trainer,
            )

            if self.rank==0:
                self.finish_current_task()

            return True

        except Exception as e:
            print(f"Error running task {config}: {e}")
            #print("traceback:", e.__traceback__)
            traceback.print_exc()
            if self.rank==0:
                self.release_current_task()

            return False
        finally:

            shared_spot_name = get_tid_file_path(tid)
            # Clean up the tid file after use
            if self.rank==0 and shared_spot_name and os.path.exists(shared_spot_name):
                os.remove(shared_spot_name)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from taskcore import FileSystemTaskQueueClient
|
|
2
|
+
import json
|
|
3
|
+
from termcolor import colored
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main(base_dir: str, mode: str = "status"):
    """CLI entry point: inspect a filesystem task queue.

    Args:
        base_dir: Root directory of the task queue.
        mode: "status" prints per-state counts; "show" additionally dumps
            each pending/running task's JSON payload.

    Raises:
        ValueError: If ``mode`` is not "status" or "show".
    """
    client = FileSystemTaskQueueClient(base_dir)

    if mode == "status":
        print(colored(f"Pending tasks: {client.num_pending_tasks()}", 'red', attrs=['bold']))
        print(colored(f"Running tasks: {client.num_running_tasks()}", 'green', attrs=['bold']))
        print(colored(f"Finished tasks: {client.num_finished_tasks()}", 'blue', attrs=['bold']))

    elif mode == "show":
        print(colored(f"Pending tasks: {client.num_pending_tasks()}", 'red', attrs=['bold']))
        for task_file in client.queue.pending_dir.iterdir():
            task_dict = client.queue.get_task_dict(task_file)
            # Fix: pending tasks were printed twice (one bare json.dumps,
            # then again with the filename prefix).  Keep only the
            # prefixed form, matching the running-tasks branch below.
            print(f"[{colored(task_file.name, 'green')}] {json.dumps(task_dict, indent=4)}")

        print(colored(f"Running tasks: {client.num_running_tasks()}", 'green', attrs=['bold']))
        for task_file in client.queue.running_dir.iterdir():
            task_dict = client.queue.get_task_dict(task_file)
            print(f"[{colored(task_file.name, 'green')}]{json.dumps(task_dict, indent=4)}")

    else:
        raise ValueError(f"Invalid mode: {mode}")
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
if __name__ == "__main__":
    # fire builds the CLI from main()'s signature, so
    # `python cli.py <base_dir> --mode=show` maps onto its arguments.
    # Imported lazily to keep library imports of this module cheap.
    import fire
    fire.Fire(main)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import glob
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
def atomic_write(data, filename):
    """Write ``data`` to ``filename`` atomically.

    The text goes to a sibling ".tmp" file first and is then renamed
    into place; readers never observe a partially written file.
    """
    scratch = filename + ".tmp"
    with open(scratch, "w") as handle:
        handle.write(data)
    # rename within one filesystem is atomic on POSIX
    os.rename(scratch, filename)
|
12
|
+
|
|
13
|
+
def atomic_read(filename):
    """Return the whitespace-stripped text content of ``filename``."""
    with open(filename, "r") as handle:
        content = handle.read()
    return content.strip()
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def wait_for_tid_file(job_id, timeout=60):
    """Poll /tmp until a non-empty ``{job_id}_*.json`` file appears.

    Used by follower ranks to learn which task rank 0 claimed.

    Args:
        job_id: Shared job identifier prefixing the rendezvous file name.
        timeout: Seconds to wait before giving up.

    Returns:
        Path of the most recently modified matching file.

    Raises:
        TimeoutError: If no usable file shows up within ``timeout`` seconds.
    """
    # (Removed dead code: an unused glob `pattern` string and an unused
    # `tid_file = None` initializer.)
    start = time.time()
    while True:
        files = [f for f in os.listdir("/tmp") if f.startswith(f"{job_id}_") and f.endswith(".json")]
        if files:
            # Pick the youngest match: the most recent task handoff.
            file_paths = [os.path.join("/tmp", f) for f in files]
            tid_file = max(file_paths, key=os.path.getmtime)
            # Only return once the writer has finished (file is non-empty).
            if os.path.getsize(tid_file) > 0:
                return tid_file
        if time.time() - start > timeout:
            raise TimeoutError("Timeout waiting for tid file")
        time.sleep(0.1)
34
|
+
|
|
35
|
+
def get_tid_file_path(tid):
    """Map a task id (or task-file path) to its /tmp rendezvous file.

    Raises RuntimeError when neither SLURM_JOB_ID nor MY_JOB_ID is set.
    """
    basename = os.path.basename(tid)
    job_id = os.environ.get("SLURM_JOB_ID") or os.environ.get("MY_JOB_ID")
    if job_id is None:
        raise RuntimeError("No unique job ID found in environment!")
    return f"/tmp/{job_id}_{basename}"
|
41
|
+
|
|
42
|
+
import uuid
|
|
43
|
+
import time
|
|
44
|
+
|
|
45
|
+
def get_unique_job_id():
    """Return a stable per-job identifier.

    Preference order: SLURM_JOB_ID, then MY_JOB_ID; when neither is set
    a fresh UUID is generated and cached in MY_JOB_ID so later calls
    (and child processes inheriting the environment) see the same id.
    """
    for var in ("SLURM_JOB_ID", "MY_JOB_ID"):
        candidate = os.environ.get(var)
        if candidate:
            return candidate
    fresh_id = str(uuid.uuid4())
    os.environ["MY_JOB_ID"] = fresh_id
    return fresh_id
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: taskcore-lib
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Package for basic task queue in the filesystem.
|
|
5
|
+
Home-page: https://github.com/goncalorafaria/taskcore
|
|
6
|
+
Author: Goncalo Faria
|
|
7
|
+
Author-email: gfaria@cs.washington.edu
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Dynamic: author
|
|
15
|
+
Dynamic: author-email
|
|
16
|
+
Dynamic: classifier
|
|
17
|
+
Dynamic: description
|
|
18
|
+
Dynamic: description-content-type
|
|
19
|
+
Dynamic: home-page
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
Dynamic: summary
|
|
23
|
+
|
|
24
|
+
# TaskCore
|
|
25
|
+
|
|
26
|
+
A lightweight, filesystem-based task queue system for Python. TaskCore provides a simple way to distribute and manage tasks across multiple processes or machines using the filesystem as the backend.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **Simple Setup**: No external dependencies - uses only Python standard library
|
|
31
|
+
- **Filesystem-based**: Tasks are stored as JSON files on disk
|
|
32
|
+
- **Multi-process Support**: Built-in support for distributed task processing
|
|
33
|
+
- **Fault Tolerant**: Automatic task recovery and stale task reclamation
|
|
34
|
+
- **Atomic Operations**: Safe concurrent access using atomic file operations
|
|
35
|
+
- **SLURM Integration**: Works seamlessly with SLURM job schedulers
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install taskcore-lib
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or install from source:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/goncalorafaria/taskcore
|
|
47
|
+
cd taskcore
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
### Basic Usage
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from taskcore import FileSystemTaskQueueClient
|
|
57
|
+
|
|
58
|
+
# Initialize the task queue
|
|
59
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
60
|
+
|
|
61
|
+
# Add tasks to the queue
|
|
62
|
+
task_config = {
|
|
63
|
+
"learning_rate": 1e-4,
|
|
64
|
+
"batch_size": 32,
|
|
65
|
+
"epochs": 100,
|
|
66
|
+
"model_name": "bert-base-uncased"
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
task_id = queue.add_task(task_config)
|
|
70
|
+
print(f"Added task with ID: {task_id}")
|
|
71
|
+
|
|
72
|
+
# Process tasks
|
|
73
|
+
def init_function(**config):
|
|
74
|
+
# Initialize your model/trainer here
|
|
75
|
+
trainer = create_trainer(**config)
|
|
76
|
+
return trainer, {"status": "initialized"}
|
|
77
|
+
|
|
78
|
+
def run_function(trainer):
|
|
79
|
+
# Run your training/processing here
|
|
80
|
+
trainer.train()
|
|
81
|
+
|
|
82
|
+
# Fetch and run tasks
|
|
83
|
+
while queue.fetch_and_run_task(init_func=init_function, func=run_function):
|
|
84
|
+
pass
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Multi-Process Example
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import os
|
|
91
|
+
from taskcore import FileSystemTaskQueueClient
|
|
92
|
+
|
|
93
|
+
def main():
|
|
94
|
+
# Use LOCAL_RANK for multi-process setups (e.g., with torch.distributed)
|
|
95
|
+
rank = int(os.environ.get("LOCAL_RANK", 0))
|
|
96
|
+
queue = FileSystemTaskQueueClient("/path/to/tasks", rank=rank)
|
|
97
|
+
|
|
98
|
+
def init_func(**config):
|
|
99
|
+
# Initialize your distributed training setup
|
|
100
|
+
trainer = create_distributed_trainer(**config)
|
|
101
|
+
return trainer, {"wandb_link": trainer.wandb_link}
|
|
102
|
+
|
|
103
|
+
def run_func(trainer):
|
|
104
|
+
trainer.train()
|
|
105
|
+
|
|
106
|
+
# Process tasks until none are available
|
|
107
|
+
while queue.fetch_and_run_task(init_func=init_func, func=run_func):
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
if __name__ == "__main__":
|
|
111
|
+
main()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Task Generation Example
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from taskcore import FileSystemTaskQueueClient
|
|
118
|
+
|
|
119
|
+
# Initialize queue
|
|
120
|
+
queue = FileSystemTaskQueueClient("/path/to/task/directory")
|
|
121
|
+
|
|
122
|
+
# Define hyperparameter sweep
|
|
123
|
+
learning_rates = [1e-4, 1e-5, 1e-6]
|
|
124
|
+
batch_sizes = [16, 32, 64]
|
|
125
|
+
model_names = ["bert-base-uncased", "roberta-base"]
|
|
126
|
+
|
|
127
|
+
# Generate all combinations
|
|
128
|
+
for lr in learning_rates:
|
|
129
|
+
for batch_size in batch_sizes:
|
|
130
|
+
for model_name in model_names:
|
|
131
|
+
config = {
|
|
132
|
+
"learning_rate": lr,
|
|
133
|
+
"batch_size": batch_size,
|
|
134
|
+
"model_name": model_name,
|
|
135
|
+
"epochs": 100,
|
|
136
|
+
"wandb_project": "my-experiment"
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
task_id = queue.add_task(config)
|
|
140
|
+
print(f"Added task {task_id}: {config}")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## API Reference
|
|
144
|
+
|
|
145
|
+
### FileSystemTaskQueueClient
|
|
146
|
+
|
|
147
|
+
The main client class for interacting with the task queue.
|
|
148
|
+
|
|
149
|
+
#### Constructor
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
FileSystemTaskQueueClient(base_dir: str, rank: int = 0)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
- `base_dir`: Directory where task files are stored
|
|
156
|
+
- `rank`: Process rank for multi-process setups (default: 0)
|
|
157
|
+
|
|
158
|
+
#### Methods
|
|
159
|
+
|
|
160
|
+
##### `add_task(task_dict: Dict) -> str`
|
|
161
|
+
Add a new task to the queue.
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
task_id = queue.add_task({"param": "value"})
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
##### `fetch_and_run_task(init_func: Callable, func: Callable) -> bool`
|
|
168
|
+
Fetch a task, initialize it with `init_func`, and run it with `func`.
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
def init_func(**config):
|
|
172
|
+
return trainer, extra_info
|
|
173
|
+
|
|
174
|
+
def run_func(trainer):
|
|
175
|
+
trainer.train()
|
|
176
|
+
|
|
177
|
+
success = queue.fetch_and_run_task(init_func, run_func)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
##### `fetch_task() -> Tuple[str, Dict]`
|
|
181
|
+
Fetch the next available task.
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
task_id, config = queue.fetch_task()
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
##### `finish_current_task()`
|
|
188
|
+
Mark the current task as completed.
|
|
189
|
+
|
|
190
|
+
##### `release_current_task()`
|
|
191
|
+
Release the current task back to the pending queue.
|
|
192
|
+
|
|
193
|
+
##### `edit_current_task(new_dict: Dict)`
|
|
194
|
+
Update the current task's configuration.
|
|
195
|
+
|
|
196
|
+
##### `get_current_task() -> Dict`
|
|
197
|
+
Get the configuration of the current task.
|
|
198
|
+
|
|
199
|
+
##### `has_task() -> bool`
|
|
200
|
+
Check if a task is currently being processed.
|
|
201
|
+
|
|
202
|
+
## Directory Structure
|
|
203
|
+
|
|
204
|
+
TaskCore creates the following directory structure:
|
|
205
|
+
|
|
206
|
+
```
|
|
207
|
+
base_dir/
|
|
208
|
+
├── pending/ # Tasks waiting to be processed
|
|
209
|
+
├── running/ # Tasks currently being processed
|
|
210
|
+
└── finished/ # Completed tasks
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Environment Variables
|
|
214
|
+
|
|
215
|
+
TaskCore uses these environment variables for distributed processing:
|
|
216
|
+
|
|
217
|
+
- `SLURM_JOB_ID`: Job ID from SLURM scheduler
|
|
218
|
+
- `MY_JOB_ID`: Fallback job ID if SLURM_JOB_ID is not available
|
|
219
|
+
- `LOCAL_RANK`: Process rank in multi-process setups
|
|
220
|
+
|
|
221
|
+
## Error Handling
|
|
222
|
+
|
|
223
|
+
Tasks that fail during execution are automatically released back to the pending queue:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
try:
|
|
227
|
+
queue.fetch_and_run_task(init_func, run_func)
|
|
228
|
+
except Exception as e:
|
|
229
|
+
print(f"Task failed: {e}")
|
|
230
|
+
# Task is automatically released back to pending queue
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Stale Task Recovery
|
|
234
|
+
|
|
235
|
+
TaskCore automatically reclaims stale tasks (default: 4 hours timeout):
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
# Custom timeout in seconds
|
|
239
|
+
queue.queue.reclaim_stale_tasks(timeout_seconds=3600) # 1 hour
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
taskcore/__init__.py
|
|
5
|
+
taskcore/base.py
|
|
6
|
+
taskcore/cli.py
|
|
7
|
+
taskcore/dist.py
|
|
8
|
+
taskcore_lib.egg-info/PKG-INFO
|
|
9
|
+
taskcore_lib.egg-info/SOURCES.txt
|
|
10
|
+
taskcore_lib.egg-info/dependency_links.txt
|
|
11
|
+
taskcore_lib.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
taskcore
|