taskcore-lib 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
taskcore/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .base import FileSystemTaskQueueClient
taskcore/base.py ADDED
@@ -0,0 +1,233 @@
1
+ import json
2
+ import uuid
3
+ from pathlib import Path
4
+ import os
5
+ import time
6
+ import os
7
+ import time
8
+
9
+ import os
10
+ import json
11
+ import uuid
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Optional, Dict
15
+ from taskcore.dist import atomic_write, get_unique_job_id, wait_for_tid_file, get_tid_file_path
16
+ import traceback
17
+ from typing import Callable
18
+
19
class FileSystemTaskQueue:
    """A minimal multi-worker task queue backed by the filesystem.

    Each task is a JSON file that moves between three directories —
    ``pending/``, ``running/`` and ``finished/`` — via ``os.rename`` /
    ``os.replace``, which are atomic on POSIX filesystems and therefore
    safe under concurrent workers sharing the same ``base_dir``.
    """

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.pending_dir = self.base_dir / "pending"
        self.running_dir = self.base_dir / "running"
        self.finished_dir = self.base_dir / "finished"
        # Ensure the three state directories exist.
        for d in [self.pending_dir, self.running_dir, self.finished_dir]:
            d.mkdir(parents=True, exist_ok=True)

    def add_task(self, task_dict: Dict):
        """Enqueue *task_dict* as a new pending task and return its id.

        The payload is staged in a temporary file OUTSIDE ``pending/`` and
        atomically renamed into place, so a concurrent ``fetch_task`` can
        never claim a half-written JSON file (the previous implementation
        wrote directly into ``pending/``, which raced with fetchers).
        """
        task_id = str(uuid.uuid4())
        task_file = self.pending_dir / f"{task_id}.json"
        # Stage outside pending/ so fetch_task never sees a partial file.
        tmp_file = self.base_dir / f"{task_id}.json.tmp"
        with open(tmp_file, "w") as f:
            json.dump(task_dict, f)
        os.replace(tmp_file, task_file)  # atomic publish
        return task_id

    def num_pending_tasks(self) -> int:
        """Number of files currently in pending/."""
        return len(list(self.pending_dir.iterdir()))

    def num_running_tasks(self) -> int:
        """Number of files currently in running/."""
        return len(list(self.running_dir.iterdir()))

    def num_finished_tasks(self) -> int:
        """Number of files currently in finished/."""
        return len(list(self.finished_dir.iterdir()))

    def read_task(self, task_id: str) -> Dict:
        """Load the JSON payload of a *running* task by its id."""
        task_file = self.running_dir / f"{task_id}.json"
        with open(task_file, "r") as f:
            return json.load(f)

    def fetch_task(self) -> Optional[str]:
        """Claim one pending task and return its running-file path.

        Claiming is an atomic rename into ``running/``; if another worker
        wins the race the rename raises ``FileNotFoundError`` and the next
        pending file is tried. Returns ``None`` when nothing was claimed.
        """
        for task_file in self.pending_dir.iterdir():
            running_file = self.running_dir / task_file.name
            try:
                os.rename(task_file, running_file)  # atomic move
                return str(running_file)
            except FileNotFoundError:
                continue  # another worker got it
        return None

    def finish_task(self, running_file: str):
        """Move a claimed task from running/ to finished/."""
        running_path = Path(running_file)
        finished_file = self.finished_dir / running_path.name
        os.rename(running_path, finished_file)

    def reset_task_timer(self, running_file: str):
        """Touch the running file's mtime so it is not reclaimed as stale."""
        now = time.time()
        os.utime(running_file, (now, now))

    def reclaim_stale_tasks(self, timeout_seconds: int = 600):
        """Move running tasks idle for over *timeout_seconds* back to pending/.

        Staleness is judged by the running file's mtime (refreshed via
        ``reset_task_timer``).
        """
        now = time.time()
        for task_file in self.running_dir.iterdir():
            try:
                mtime = task_file.stat().st_mtime
            except FileNotFoundError:
                continue  # finished or reclaimed concurrently
            if now - mtime > timeout_seconds:
                pending_file = self.pending_dir / task_file.name
                try:
                    os.rename(task_file, pending_file)
                except FileNotFoundError:
                    continue  # Already reclaimed by another worker

    def get_task_dict(self, task_file: str) -> Dict:
        """Load a task's JSON payload from an explicit file path."""
        with open(task_file, "r") as f:
            return json.load(f)

    def update_task_info(
        self,
        task_file: str,
        new_dict: Optional[Dict] = None,
    ):
        """
        Overwrite the entire task file with new_dict if provided.
        Does nothing if new_dict is None.
        """
        if new_dict is not None:
            path = Path(task_file)
            with open(path, "w") as f:
                json.dump(new_dict, f)
96
+
97
+
98
class TaskQueueClient:
    """Stateful wrapper around a :class:`FileSystemTaskQueue`.

    Tracks at most one "current" task at a time; ``fetch_task``,
    ``finish_current_task`` and ``release_current_task`` all operate on
    that single task.
    """

    def __init__(self, queue: "FileSystemTaskQueue", timeout: int = 60*60*4):
        """
        Args:
            queue: Backing filesystem task queue.
            timeout: Seconds after which a running task counts as stale
                and is reclaimed on the next fetch (default: 4 hours).
        """
        self.queue = queue
        self.current_task_file = None
        # BUG FIX: was hard-coded to 60*60*4, silently ignoring the
        # caller-supplied timeout argument.
        self.timeout = timeout

    def num_pending_tasks(self) -> int:
        """Number of tasks waiting to be claimed."""
        return self.queue.num_pending_tasks()

    def num_running_tasks(self) -> int:
        """Number of tasks currently claimed by workers."""
        return self.queue.num_running_tasks()

    def num_finished_tasks(self) -> int:
        """Number of completed tasks."""
        return self.queue.num_finished_tasks()

    def read_task(self, task_file: str):
        """Load a running task's JSON payload by its id."""
        return self.queue.read_task(task_file)

    def fetch_task(self):
        """Claim the next pending task.

        Stale running tasks are reclaimed first. Returns a tuple of
        (running file path, task dict), or None if the queue is empty.

        Raises:
            RuntimeError: If a task is already fetched by this client.
        """
        if self.current_task_file is not None:
            raise RuntimeError("A task is already fetched. Finish or release it before fetching another.")
        self.queue.reclaim_stale_tasks(self.timeout)  # Reclaim stale tasks before fetching
        task_file = self.queue.fetch_task()
        if task_file:
            self.current_task_file = task_file
            return self.current_task_file, self.get_current_task()
        return None

    def edit_current_task(self, new_dict):
        """Overwrite the current task's payload with *new_dict*."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        self.queue.update_task_info(self.current_task_file, new_dict)

    def finish_current_task(self):
        """Move the current task to finished/ and forget it."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        self.queue.finish_task(self.current_task_file)
        self.current_task_file = None

    def release_current_task(self):
        """Return the current task to the pending queue (e.g. after a failure)."""
        if self.current_task_file is None:
            raise RuntimeError("No task is currently fetched.")
        # Move the task back to pending with an atomic rename.
        task_path = Path(self.current_task_file)
        pending_file = self.queue.pending_dir / task_path.name
        os.rename(task_path, pending_file)
        self.current_task_file = None

    def get_current_task(self):
        """Return the current task's dict, or None if no task is fetched."""
        if self.current_task_file is None:
            return None
        return self.queue.get_task_dict(self.current_task_file)

    def has_task(self):
        """True if a task is currently fetched by this client."""
        return self.current_task_file is not None
154
+
155
+
156
def dummy_run_func(*args, **kwargs):
    """No-op run function; default `func` for fetch_and_run_task."""
    return None
158
+
159
class FileSystemTaskQueueClient(TaskQueueClient):
    """Task-queue client that coordinates a single task across multiple ranks.

    Rank 0 claims a task from the filesystem queue and publishes its id to a
    shared spot (a file under /tmp, see taskcore.dist); all other ranks wait
    for that file and then load the same task from the queue's running/
    directory. Only rank 0 mutates queue state (finish/release/edit).
    """
    def __init__(self,base_dir: str, rank: int = 0, timeout: int = 60*60*4):
        # Build the backing queue, then let the base class wire up
        # current-task tracking and the stale-task timeout.
        self.queue = FileSystemTaskQueue(base_dir)
        super().__init__(self.queue, timeout)
        self.rank = rank  # process rank; rank 0 is the coordinator

    def add_task(self, task_dict: Dict):
        """Add a task to the underlying filesystem queue; returns its id."""
        return self.queue.add_task(task_dict)

    def fetch_task(self):
        """Fetch one task and distribute it to every rank of this job.

        Raises RuntimeError on rank 0 when the queue is empty.

        NOTE(review): the return value is asymmetric across ranks — rank 0
        returns the full running-file path as `tid`, while other ranks
        return the bare uuid parsed from the shared file name. Both forms
        happen to work with the call sites below, but verify before
        relying on `tid` elsewhere.
        """
        job_id = get_unique_job_id()

        if self.rank == 0:
            # Main process fetches the next task
            result =super().fetch_task() ## fetch the task on main process
            if not result:
                print("No task found")
                raise RuntimeError("No task found")

            print(f"Fetched task {result}")
            tid, config = result

            shared_spot_name = get_tid_file_path(tid) ## get path for the shared spot
            atomic_write(tid, shared_spot_name)## write to a shared spot to tell other processes to fetch the task

            return tid, config
        else:
            # Fixed delay gives rank 0 time to claim the task and publish
            # the shared file; presumably tuned for slow shared
            # filesystems — TODO confirm 15s is still needed.
            time.sleep(15)
            tid = wait_for_tid_file(job_id) ## wait for new file in shared spot.

            # Shared file is named "/tmp/{job_id}_{uuid}.json"; the uuid
            # contains no underscores, so taking the last "_" segment and
            # stripping ".json" recovers the task id.
            tid = tid.split("_")[-1].split(".")[0] ## get the tid from the file name

            config = self.read_task(tid) ## broadcast the task to all processes

            return tid, config

    def fetch_and_run_task(self, init_func: Callable, func: Callable = dummy_run_func):
        """Fetch a task, initialize with *init_func*, run it with *func*.

        init_func is called as init_func(**config) and must return
        (trainer, extra_info); func is called as func(trainer).
        Returns True on success. On any exception the error is printed,
        rank 0 releases the task back to pending, and False is returned —
        failures do NOT propagate to the caller. The shared /tmp tid file
        is removed by rank 0 in all cases.
        """
        tid, config = self.fetch_task() ## takes care of distributing a single task to all processes

        print(f"Running task {config}")

        try:
            trainer, extra_info = init_func(**config)

            if self.rank == 0:
                # Record init-time metadata (and the task id) back into
                # the task file so it is visible to queue inspectors.
                config.update(extra_info)
                #config["wandb_link"] = trainer.wandb_link
                config["task_id"] = tid
                self.edit_current_task(config)

            func(
                trainer,
            )

            if self.rank==0:
                self.finish_current_task()

            return True

        except Exception as e:
            print(f"Error running task {config}: {e}")
            #print("traceback:", e.__traceback__)
            traceback.print_exc()
            if self.rank==0:
                # Put the task back in pending/ so another worker retries it.
                self.release_current_task()

            return False
        finally:

            shared_spot_name = get_tid_file_path(tid)
            # Clean up the tid file after use
            if self.rank==0 and shared_spot_name and os.path.exists(shared_spot_name):
                os.remove(shared_spot_name)
233
+
taskcore/cli.py ADDED
@@ -0,0 +1,39 @@
1
+ from taskcore import FileSystemTaskQueueClient
2
+ import json
3
+ from termcolor import colored
4
+
5
+
6
def main(base_dir: str, mode: str = "status"):
    """Inspect a filesystem task queue from the command line.

    Args:
        base_dir: Root directory of the task queue.
        mode: "status" prints pending/running/finished counts;
            "show" additionally dumps each pending and running task's
            JSON payload.

    Raises:
        ValueError: If *mode* is neither "status" nor "show".
    """
    client = FileSystemTaskQueueClient(base_dir)

    if mode == "status":
        print(colored(f"Pending tasks: {client.num_pending_tasks()}", 'red', attrs=['bold']))
        print(colored(f"Running tasks: {client.num_running_tasks()}", 'green', attrs=['bold']))
        print(colored(f"Finished tasks: {client.num_finished_tasks()}", 'blue', attrs=['bold']))

    elif mode == "show":
        print(colored(f"Pending tasks: {client.num_pending_tasks()}", 'red', attrs=['bold']))
        for task_file in client.queue.pending_dir.iterdir():
            task_dict = client.queue.get_task_dict(task_file)
            # BUG FIX: the payload was previously printed twice here —
            # once bare and once with the filename prefix.
            print(f"[{colored(task_file.name, 'green')}] {json.dumps(task_dict, indent=4)}")

        print(colored(f"Running tasks: {client.num_running_tasks()}", 'green', attrs=['bold']))
        for task_file in client.queue.running_dir.iterdir():
            task_dict = client.queue.get_task_dict(task_file)
            print(f"[{colored(task_file.name, 'green')}]{json.dumps(task_dict, indent=4)}")

    else:
        raise ValueError(f"Invalid mode: {mode}")
34
+
35
+
36
+
37
if __name__ == "__main__":
    # Expose `main` as a CLI via python-fire, e.g.:
    #   python -m taskcore.cli /path/to/queue --mode=show
    import fire
    fire.Fire(main)
taskcore/dist.py ADDED
@@ -0,0 +1,53 @@
1
+
2
+
3
+ import os
4
+ import glob
5
+ import time
6
+
7
def atomic_write(data, filename):
    """Atomically write string *data* to *filename*.

    The data is written to a temporary sibling file and then moved into
    place, so concurrent readers see either the old content or the
    complete new content — never a partial write.
    """
    tmp_filename = filename + ".tmp"
    with open(tmp_filename, "w") as f:
        f.write(data)
        f.flush()
        os.fsync(f.fileno())  # make sure the bytes are on disk before publishing
    # os.replace is atomic on POSIX and, unlike os.rename, also succeeds
    # on Windows when the destination already exists.
    os.replace(tmp_filename, filename)
12
+
13
def atomic_read(filename):
    """Read *filename* and return its content stripped of surrounding whitespace."""
    with open(filename, "r") as handle:
        content = handle.read()
    return content.strip()
16
+
17
+
18
def wait_for_tid_file(job_id, timeout=60):
    """Poll /tmp until a non-empty ``{job_id}_*.json`` file appears.

    Returns:
        The path of the most recently modified matching file.

    Raises:
        TimeoutError: If no non-empty matching file shows up within
            *timeout* seconds.
    """
    start = time.time()
    while True:
        # Use glob instead of manual listdir filtering (this also makes
        # the module-level `glob` import actually used; the previous
        # version built an unused `pattern` string).
        candidates = glob.glob(f"/tmp/{job_id}_*.json")
        if candidates:
            # Prefer the youngest file in case stale ones linger from
            # earlier tasks of the same job.
            tid_file = max(candidates, key=os.path.getmtime)
            # Only hand back the file once the writer has put content in it.
            if os.path.getsize(tid_file) > 0:
                return tid_file
        if time.time() - start > timeout:
            raise TimeoutError("Timeout waiting for tid file")
        time.sleep(0.1)
34
+
35
def get_tid_file_path(tid):
    """Return the /tmp path used to share task id *tid* between processes.

    The filename combines the current job id (SLURM_JOB_ID, falling back
    to MY_JOB_ID) with the basename of *tid*.
    """
    job_id = os.environ.get("SLURM_JOB_ID") or os.environ.get("MY_JOB_ID")
    if job_id is None:
        raise RuntimeError("No unique job ID found in environment!")
    return f"/tmp/{job_id}_{os.path.basename(tid)}"
41
+
42
+ import uuid
43
+ import time
44
+
45
def get_unique_job_id():
    """Return a stable job id for this process.

    Prefers SLURM_JOB_ID, then MY_JOB_ID; if neither is set, generates a
    fresh UUID and stores it in MY_JOB_ID so subsequent calls (and child
    processes inheriting the environment) get the same id.
    """
    for var in ("SLURM_JOB_ID", "MY_JOB_ID"):
        candidate = os.environ.get(var)
        if candidate:
            return candidate
    fresh = str(uuid.uuid4())
    os.environ["MY_JOB_ID"] = fresh
    return fresh
@@ -0,0 +1,244 @@
1
+ Metadata-Version: 2.4
2
+ Name: taskcore-lib
3
+ Version: 0.0.2
4
+ Summary: Package for basic task queue in the filesystem.
5
+ Home-page: https://github.com/goncalorafaria/taskcore
6
+ Author: Goncalo Faria
7
+ Author-email: gfaria@cs.washington.edu
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.6.0
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Dynamic: author
15
+ Dynamic: author-email
16
+ Dynamic: classifier
17
+ Dynamic: description
18
+ Dynamic: description-content-type
19
+ Dynamic: home-page
20
+ Dynamic: license-file
21
+ Dynamic: requires-python
22
+ Dynamic: summary
23
+
24
+ # TaskCore
25
+
26
+ A lightweight, filesystem-based task queue system for Python. TaskCore provides a simple way to distribute and manage tasks across multiple processes or machines using the filesystem as the backend.
27
+
28
+ ## Features
29
+
30
+ - **Simple Setup**: No external dependencies - uses only Python standard library
31
+ - **Filesystem-based**: Tasks are stored as JSON files on disk
32
+ - **Multi-process Support**: Built-in support for distributed task processing
33
+ - **Fault Tolerant**: Automatic task recovery and stale task reclamation
34
+ - **Atomic Operations**: Safe concurrent access using atomic file operations
35
+ - **SLURM Integration**: Works seamlessly with SLURM job schedulers
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install taskcore-lib
41
+ ```
42
+
43
+ Or install from source:
44
+
45
+ ```bash
46
+ git clone https://github.com/goncalorafaria/taskcore
47
+ cd taskcore
48
+ pip install -e .
49
+ ```
50
+
51
+ ## Quick Start
52
+
53
+ ### Basic Usage
54
+
55
+ ```python
56
+ from taskcore import FileSystemTaskQueueClient
57
+
58
+ # Initialize the task queue
59
+ queue = FileSystemTaskQueueClient("/path/to/task/directory")
60
+
61
+ # Add tasks to the queue
62
+ task_config = {
63
+ "learning_rate": 1e-4,
64
+ "batch_size": 32,
65
+ "epochs": 100,
66
+ "model_name": "bert-base-uncased"
67
+ }
68
+
69
+ task_id = queue.add_task(task_config)
70
+ print(f"Added task with ID: {task_id}")
71
+
72
+ # Process tasks
73
+ def init_function(**config):
74
+ # Initialize your model/trainer here
75
+ trainer = create_trainer(**config)
76
+ return trainer, {"status": "initialized"}
77
+
78
+ def run_function(trainer):
79
+ # Run your training/processing here
80
+ trainer.train()
81
+
82
+ # Fetch and run tasks
83
+ while queue.fetch_and_run_task(init_func=init_function, func=run_function):
84
+ pass
85
+ ```
86
+
87
+ ### Multi-Process Example
88
+
89
+ ```python
90
+ import os
91
+ from taskcore import FileSystemTaskQueueClient
92
+
93
+ def main():
94
+ # Use LOCAL_RANK for multi-process setups (e.g., with torch.distributed)
95
+ rank = int(os.environ.get("LOCAL_RANK", 0))
96
+ queue = FileSystemTaskQueueClient("/path/to/tasks", rank=rank)
97
+
98
+ def init_func(**config):
99
+ # Initialize your distributed training setup
100
+ trainer = create_distributed_trainer(**config)
101
+ return trainer, {"wandb_link": trainer.wandb_link}
102
+
103
+ def run_func(trainer):
104
+ trainer.train()
105
+
106
+ # Process tasks until none are available
107
+ while queue.fetch_and_run_task(init_func=init_func, func=run_func):
108
+ pass
109
+
110
+ if __name__ == "__main__":
111
+ main()
112
+ ```
113
+
114
+ ### Task Generation Example
115
+
116
+ ```python
117
+ from taskcore import FileSystemTaskQueueClient
118
+
119
+ # Initialize queue
120
+ queue = FileSystemTaskQueueClient("/path/to/task/directory")
121
+
122
+ # Define hyperparameter sweep
123
+ learning_rates = [1e-4, 1e-5, 1e-6]
124
+ batch_sizes = [16, 32, 64]
125
+ model_names = ["bert-base-uncased", "roberta-base"]
126
+
127
+ # Generate all combinations
128
+ for lr in learning_rates:
129
+ for batch_size in batch_sizes:
130
+ for model_name in model_names:
131
+ config = {
132
+ "learning_rate": lr,
133
+ "batch_size": batch_size,
134
+ "model_name": model_name,
135
+ "epochs": 100,
136
+ "wandb_project": "my-experiment"
137
+ }
138
+
139
+ task_id = queue.add_task(config)
140
+ print(f"Added task {task_id}: {config}")
141
+ ```
142
+
143
+ ## API Reference
144
+
145
+ ### FileSystemTaskQueueClient
146
+
147
+ The main client class for interacting with the task queue.
148
+
149
+ #### Constructor
150
+
151
+ ```python
152
+ FileSystemTaskQueueClient(base_dir: str, rank: int = 0, timeout: int = 60*60*4)
153
+ ```
154
+
155
+ - `base_dir`: Directory where task files are stored
156
+ - `rank`: Process rank for multi-process setups (default: 0)
+ - `timeout`: Seconds before a running task is considered stale and reclaimed (default: 4 hours)
157
+
158
+ #### Methods
159
+
160
+ ##### `add_task(task_dict: Dict) -> str`
161
+ Add a new task to the queue.
162
+
163
+ ```python
164
+ task_id = queue.add_task({"param": "value"})
165
+ ```
166
+
167
+ ##### `fetch_and_run_task(init_func: Callable, func: Callable) -> bool`
168
+ Fetch a task, initialize it with `init_func`, and run it with `func`.
169
+
170
+ ```python
171
+ def init_func(**config):
172
+ return trainer, extra_info
173
+
174
+ def run_func(trainer):
175
+ trainer.train()
176
+
177
+ success = queue.fetch_and_run_task(init_func, run_func)
178
+ ```
179
+
180
+ ##### `fetch_task() -> Tuple[str, Dict]`
181
+ Fetch the next available task.
182
+
183
+ ```python
184
+ task_id, config = queue.fetch_task()
185
+ ```
186
+
187
+ ##### `finish_current_task()`
188
+ Mark the current task as completed.
189
+
190
+ ##### `release_current_task()`
191
+ Release the current task back to the pending queue.
192
+
193
+ ##### `edit_current_task(new_dict: Dict)`
194
+ Update the current task's configuration.
195
+
196
+ ##### `get_current_task() -> Dict`
197
+ Get the configuration of the current task.
198
+
199
+ ##### `has_task() -> bool`
200
+ Check if a task is currently being processed.
201
+
202
+ ## Directory Structure
203
+
204
+ TaskCore creates the following directory structure:
205
+
206
+ ```
207
+ base_dir/
208
+ ├── pending/ # Tasks waiting to be processed
209
+ ├── running/ # Tasks currently being processed
210
+ └── finished/ # Completed tasks
211
+ ```
212
+
213
+ ## Environment Variables
214
+
215
+ TaskCore uses these environment variables for distributed processing:
216
+
217
+ - `SLURM_JOB_ID`: Job ID from SLURM scheduler
218
+ - `MY_JOB_ID`: Fallback job ID if SLURM_JOB_ID is not available
219
+ - `LOCAL_RANK`: Process rank in multi-process setups
220
+
221
+ ## Error Handling
222
+
223
+ Tasks that fail during execution are automatically released back to the pending queue:
224
+
225
+ ```python
+ # fetch_and_run_task returns False (rather than raising) when a task
+ # fails during execution; the failed task is automatically released
+ # back to the pending queue for another worker to retry.
+ ok = queue.fetch_and_run_task(init_func, run_func)
+ if not ok:
+     print("Task failed and was returned to the pending queue")
+ ```
232
+
233
+ ## Stale Task Recovery
234
+
235
+ TaskCore automatically reclaims stale tasks (default: 4 hours timeout):
236
+
237
+ ```python
238
+ # Custom timeout in seconds
239
+ queue.queue.reclaim_stale_tasks(timeout_seconds=3600) # 1 hour
240
+ ```
241
+
242
+ ## License
243
+
244
+ MIT License - see [LICENSE](LICENSE) file for details.
@@ -0,0 +1,9 @@
1
+ taskcore/__init__.py,sha256=5qiCW0vnEOZmiTQEEahEGLtOWJOv3Ors6xoEXqwV6io,43
2
+ taskcore/base.py,sha256=hZlc2Tq7nU2UcxnoJs20-2iH-zQXBT3KdNFetWG1lr4,8019
3
+ taskcore/cli.py,sha256=uC7nhjxglDRi-SaxpFfZCgzP1_WkHwPvhyYPWZqxlGI,1533
4
+ taskcore/dist.py,sha256=6qaFc85YsbgFbvCgBCII7OYeP368f0m4IKF-pF-LLog,1692
5
+ taskcore_lib-0.0.2.dist-info/licenses/LICENSE,sha256=ldUsph09cc1l_w5lD-IeZQR_RmAr_IfCq1m1sK7timk,1070
6
+ taskcore_lib-0.0.2.dist-info/METADATA,sha256=3CuXnfjFcLRbqPgGpHfkIToLrtjpoO0DBQ-FG1bazUc,6164
7
+ taskcore_lib-0.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ taskcore_lib-0.0.2.dist-info/top_level.txt,sha256=SYLIoRvq8EDnjU5FdNfJldfSFwgHcfwij8xqu3bREAg,9
9
+ taskcore_lib-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Gonçalo Faria
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ taskcore