task-checkpoint 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- task_checkpoint-0.1.0.dist-info/METADATA +242 -0
- task_checkpoint-0.1.0.dist-info/RECORD +16 -0
- task_checkpoint-0.1.0.dist-info/WHEEL +5 -0
- task_checkpoint-0.1.0.dist-info/licenses/LICENSE +21 -0
- task_checkpoint-0.1.0.dist-info/top_level.txt +1 -0
- task_pool/__init__.py +28 -0
- task_pool/codecs.py +18 -0
- task_pool/core.py +339 -0
- task_pool/models.py +45 -0
- task_pool/stores/__init__.py +14 -0
- task_pool/stores/base.py +32 -0
- task_pool/stores/csv_file.py +46 -0
- task_pool/stores/json_file.py +179 -0
- task_pool/stores/source_backed.py +65 -0
- task_pool/stores/sqlite.py +246 -0
- task_pool/stores/text_file.py +34 -0
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: task-checkpoint
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A small Python library for resumable task pools in long-running scripts.
|
|
5
|
+
Author: 1nvisibleCat
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: checkpoint,task-checkpoint,task-pool,resume,batch-processing,job-queue
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Operating System :: POSIX
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# Task Pool
|
|
26
|
+
|
|
27
|
+
Task Pool is a small Python library for long-running scripts that need to process many small jobs without losing progress.
|
|
28
|
+
|
|
29
|
+
Turn a fragile `for` loop into a resumable task pool.
|
|
30
|
+
|
|
31
|
+
When a run stops halfway, Task Pool remembers what is done, what is running, and what still needs work. You can restart the script and continue from the last known point.
|
|
32
|
+
|
|
33
|
+
Use it for crawling, experiments, data analysis, batch API calls, file processing, and other work where unfinished items should be easy to resume.
|
|
34
|
+
|
|
35
|
+
It feels like a tiny job queue, but stays inside your Python script. For local use, no database server, message queue, or service setup is needed.
|
|
36
|
+
|
|
37
|
+
## Quickstart
|
|
38
|
+
|
|
39
|
+
Install it with:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install task-checkpoint
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Put one URL per line:
|
|
46
|
+
|
|
47
|
+
```text
|
|
48
|
+
https://example.com/a
|
|
49
|
+
https://example.com/b
|
|
50
|
+
https://example.com/c
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Then process the file as a task pool:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from task_pool import TaskPool
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
pool = TaskPool("urls.txt")
|
|
60
|
+
|
|
61
|
+
def fetch_one(payload):
|
|
62
|
+
fetch_url(payload["line"])
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
pool.for_each(fetch_one)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
You can also use a regular loop:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
for payload in pool:
|
|
72
|
+
fetch_url(payload["line"])
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
If you need task metadata, use `iter_tasks()`:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
for task in pool.iter_tasks():
|
|
79
|
+
url = task.payload["line"]
|
|
80
|
+
result = fetch_url(url)
|
|
81
|
+
save_result(task.key, result)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Each completed task is committed. If an exception is raised, the current task is rolled back to `not_start` and the exception is raised again.
|
|
85
|
+
|
|
86
|
+
For very small scripts, a lambda also works:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
pool.for_each(lambda payload: fetch_url(payload["line"]))
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
You can print a small progress summary:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
print(pool.stats())
|
|
96
|
+
# {"total": 10, "not_start": 3, "pending": 0, "committed": 7}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
For structured jobs, use a JSON task pool:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
pool = TaskPool("tasks.json")
|
|
103
|
+
|
|
104
|
+
pool.append({"input_path": "data/a.json", "method": "baseline"})
|
|
105
|
+
pool.append({"input_path": "data/b.json", "method": "baseline"})
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Manual Control
|
|
109
|
+
|
|
110
|
+
Use `lease()` when you want one task at a time and need explicit control.
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
pool = TaskPool("tasks.json")
|
|
114
|
+
|
|
115
|
+
with pool.lease() as task:
|
|
116
|
+
if task is None:
|
|
117
|
+
return
|
|
118
|
+
|
|
119
|
+
do_work(task.payload)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
If there is no task to process, the `with` statement binds `task` to `None` (`lease()` itself always returns a context manager).
|
|
123
|
+
|
|
124
|
+
Leaving the `with` block normally commits the task. If an exception is raised, the task is rolled back automatically.
|
|
125
|
+
|
|
126
|
+
## Store Selection
|
|
127
|
+
|
|
128
|
+
You can choose a store by file suffix:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
TaskPool("tasks.json") # JSONFileStore
|
|
132
|
+
TaskPool("tasks.sqlite") # SQLiteStore
|
|
133
|
+
TaskPool("tasks.sqlite3") # SQLiteStore
|
|
134
|
+
TaskPool("tasks.db") # SQLiteStore
|
|
135
|
+
TaskPool("rows.csv") # CSVRowStore with sidecar state
|
|
136
|
+
TaskPool("rows.tsv") # CSVRowStore with sidecar state
|
|
137
|
+
TaskPool("urls.txt") # TextLineStore with sidecar state
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
You can also pass a store explicitly:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from task_pool import JSONFileStore, SQLiteStore, TaskPool
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
pool = TaskPool(JSONFileStore("tasks.json"))
|
|
147
|
+
pool = TaskPool(SQLiteStore("tasks.db"))
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Source Files
|
|
151
|
+
|
|
152
|
+
Task Pool can also read tasks from source files.
|
|
153
|
+
|
|
154
|
+
### Text files
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
pool = TaskPool("urls.txt")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Example:
|
|
161
|
+
|
|
162
|
+
```text
|
|
163
|
+
https://example.com/a
|
|
164
|
+
https://example.com/b
|
|
165
|
+
https://example.com/c
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Each non-empty line becomes a task:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
{
|
|
172
|
+
"line": "https://example.com/a",
|
|
173
|
+
"line_number": 1,
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### CSV and TSV files
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
pool = TaskPool("rows.csv")
|
|
181
|
+
pool = TaskPool("rows.tsv")
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
By default, CSV and TSV files are read with a header row:
|
|
185
|
+
|
|
186
|
+
```csv
|
|
187
|
+
code,name
|
|
188
|
+
A0001,Alpha
|
|
189
|
+
A0002,Beta
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The first data row (the row after the header) becomes:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
{"code": "A0001", "name": "Alpha"}
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
For files without headers:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
pool = TaskPool.csv("rows.csv", has_header=False)
|
|
202
|
+
pool = TaskPool.tsv("rows.tsv", has_header=False)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Columns are named `col1`, `col2`, and so on:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
{"col1": "A0001", "col2": "Alpha"}
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Source files are not modified. Progress is stored in a sidecar JSON file, for example:
|
|
212
|
+
|
|
213
|
+
```text
|
|
214
|
+
rows.csv.task_pool.json
|
|
215
|
+
urls.txt.task_pool.json
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Task Status
|
|
219
|
+
|
|
220
|
+
Each task has one of these statuses:
|
|
221
|
+
|
|
222
|
+
- `not_start`
|
|
223
|
+
- `pending`
|
|
224
|
+
- `committed`
|
|
225
|
+
|
|
226
|
+
The common flow is:
|
|
227
|
+
|
|
228
|
+
```text
|
|
229
|
+
not_start -> pending -> committed
|
|
230
|
+
|
|
|
231
|
+
-> not_start
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
`pending` means a worker has leased the task and is currently processing it.
|
|
235
|
+
|
|
236
|
+
## Notes
|
|
237
|
+
|
|
238
|
+
- The default store is `JSONFileStore("task_pool.json")`.
|
|
239
|
+
- Payloads are stored as JSON.
|
|
240
|
+
- JSON and source-backed stores use file locks and atomic writes.
|
|
241
|
+
- SQLite is better for larger pools or heavier concurrent use.
|
|
242
|
+
- Source-backed stores keep the source file unchanged and write progress to a sidecar JSON file.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
task_checkpoint-0.1.0.dist-info/licenses/LICENSE,sha256=dU9CUn2-fRJIT0nzCRHsH7NqYT_dVawztFs6eqUCvQY,1069
|
|
2
|
+
task_pool/__init__.py,sha256=F2UIz6jdPhxhyagCidX1z7k7u5lgPa_pvbJ_EsEM1SE,525
|
|
3
|
+
task_pool/codecs.py,sha256=M8eUaYTNXx6Axr_o0L6EAgZpT3ZwZhtt86-R-EFjRAQ,504
|
|
4
|
+
task_pool/core.py,sha256=EbyeglC2hdvEg-NIpS6iSJwBvir9wGCoIT1-2tVggs0,9224
|
|
5
|
+
task_pool/models.py,sha256=sZu2qRnM6lh6EalA0FcQ_fuF3a8FCidru8_J4nsNw4o,946
|
|
6
|
+
task_pool/stores/__init__.py,sha256=vDk1XQv2FDvMO8-Gs0qnMAySw0ovtP6pjkDiSnvLYmE,361
|
|
7
|
+
task_pool/stores/base.py,sha256=5u3o0wAAVZk63EAy4guFg8dN0iGMizUmdx8Rp8ifCJc,761
|
|
8
|
+
task_pool/stores/csv_file.py,sha256=SSMWmPWyDWGh8UXW33KpNJMZMD5EHxpdXwERLixaejI,1421
|
|
9
|
+
task_pool/stores/json_file.py,sha256=S7Fy9SULT_xMxL_qbfPc39bD21S-5YAj2j0uZtWM89w,6242
|
|
10
|
+
task_pool/stores/source_backed.py,sha256=nOTxksL_UZ5y-c8WLka1Z6BxV2m3Nf5T6dx-xbqjFUk,2194
|
|
11
|
+
task_pool/stores/sqlite.py,sha256=jLoelduviPpn9QJbmVlYFqqJXrn-t8LW2GcPwA1MVCw,8121
|
|
12
|
+
task_pool/stores/text_file.py,sha256=RUrjqU0J8SHKkhG0lu__oMqQW5b3lzxOIHqmchEkR5k,986
|
|
13
|
+
task_checkpoint-0.1.0.dist-info/METADATA,sha256=yh6IFydH7JWGz0_Hw4peLd2n0yFm5OxUqx32koLieP4,5549
|
|
14
|
+
task_checkpoint-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
15
|
+
task_checkpoint-0.1.0.dist-info/top_level.txt,sha256=DSMZD6PdiuVVyLiLFVGARWMEXOgBjhre5A9YdxwQO3s,10
|
|
16
|
+
task_checkpoint-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 1nvisibleCat
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
task_pool
|
task_pool/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from task_pool.codecs import JsonPayloadCodec
|
|
2
|
+
from task_pool.core import LeasedTask, TaskPool
|
|
3
|
+
from task_pool.models import Status, Task, TaskRecord
|
|
4
|
+
from task_pool.stores import (
|
|
5
|
+
CSVRowStore,
|
|
6
|
+
JSONFileStore,
|
|
7
|
+
SQLiteStore,
|
|
8
|
+
TaskStore,
|
|
9
|
+
TextLineStore,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"CSVRowStore",
|
|
17
|
+
"JSONFileStore",
|
|
18
|
+
"JsonPayloadCodec",
|
|
19
|
+
"LeasedTask",
|
|
20
|
+
"SQLiteStore",
|
|
21
|
+
"Status",
|
|
22
|
+
"Task",
|
|
23
|
+
"TaskPool",
|
|
24
|
+
"TaskRecord",
|
|
25
|
+
"TaskStore",
|
|
26
|
+
"TextLineStore",
|
|
27
|
+
"__version__",
|
|
28
|
+
]
|
task_pool/codecs.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JsonPayloadCodec:
    """Convert task payloads to and from compact, key-sorted JSON text."""

    def encode(self, payload: Any) -> str:
        """Return ``payload`` serialized as JSON text.

        Keys are sorted and separators carry no whitespace, so dicts that
        differ only in key order encode to the same string. Non-ASCII
        characters are written as-is rather than escaped.

        Raises:
            TypeError: if ``json.dumps`` rejects the payload (its own
                ``TypeError`` or ``ValueError`` is chained as the cause).
        """
        options = {
            "sort_keys": True,
            "separators": (",", ":"),
            "ensure_ascii": False,
        }
        try:
            encoded = json.dumps(payload, **options)
        except (TypeError, ValueError) as exc:
            raise TypeError("payload must be JSON-serializable") from exc
        return encoded

    def decode(self, payload_data: str) -> Any:
        """Parse JSON text back into Python data."""
        decoded = json.loads(payload_data)
        return decoded
|
task_pool/core.py
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import hashlib
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Any, Callable, Optional
|
|
7
|
+
|
|
8
|
+
from task_pool.codecs import JsonPayloadCodec
|
|
9
|
+
from task_pool.models import Status, Task, TaskRecord
|
|
10
|
+
from task_pool.stores.base import TaskStore
|
|
11
|
+
from task_pool.stores.json_file import JSONFileStore
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LeasedTask:
    """Handle for one task claimed from a pool, with commit/rollback helpers."""

    def __init__(self, pool, task: "Task"):
        self._task = task
        self._pool = pool
        # Becomes True once commit() or rollback() has completed.
        self._closed = False

    @property
    def key(self):
        """Key of the wrapped task."""
        wrapped = self._task
        return wrapped.key

    @property
    def payload(self):
        """Payload of the wrapped task."""
        wrapped = self._task
        return wrapped.payload

    @property
    def status(self):
        """Status recorded on the wrapped task when it was claimed."""
        wrapped = self._task
        return wrapped.status

    @property
    def closed(self):
        """Whether this lease has already been committed or rolled back."""
        return self._closed

    def to_task(self):
        """Return the underlying task object unchanged."""
        return self._task

    def commit(self):
        """Ask the owning pool to commit this task, then mark the lease closed."""
        owner = self._pool
        owner.commit(self.key)
        self._closed = True

    def rollback(self):
        """Ask the owning pool to roll this task back, then mark the lease closed."""
        owner = self._pool
        owner.rollback(self.key)
        self._closed = True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class _LeaseContext:
|
|
49
|
+
def __init__(self, pool):
|
|
50
|
+
self.pool = pool
|
|
51
|
+
self.task = None
|
|
52
|
+
|
|
53
|
+
def __enter__(self):
|
|
54
|
+
task = self.pool.acquire()
|
|
55
|
+
if task is None:
|
|
56
|
+
return None
|
|
57
|
+
self.task = LeasedTask(self.pool, task)
|
|
58
|
+
return self.task
|
|
59
|
+
|
|
60
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
61
|
+
if self.task is None or self.task.closed:
|
|
62
|
+
return False
|
|
63
|
+
if exc_type is None:
|
|
64
|
+
self.task.commit()
|
|
65
|
+
else:
|
|
66
|
+
self.task.rollback()
|
|
67
|
+
return False
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class _AsyncLeaseContext:
|
|
71
|
+
def __init__(self, pool):
|
|
72
|
+
self.pool = pool
|
|
73
|
+
self.task = None
|
|
74
|
+
|
|
75
|
+
async def __aenter__(self):
|
|
76
|
+
task = await asyncio.to_thread(self.pool.acquire)
|
|
77
|
+
if task is None:
|
|
78
|
+
return None
|
|
79
|
+
self.task = LeasedTask(self.pool, task)
|
|
80
|
+
return self.task
|
|
81
|
+
|
|
82
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
83
|
+
if self.task is None or self.task.closed:
|
|
84
|
+
return False
|
|
85
|
+
if exc_type is None:
|
|
86
|
+
await asyncio.to_thread(self.task.commit)
|
|
87
|
+
else:
|
|
88
|
+
await asyncio.to_thread(self.task.rollback)
|
|
89
|
+
return False
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class TaskPool:
    """Resumable pool of tasks persisted in a pluggable store.

    Tasks start as ``not_start``, become ``pending`` when claimed and
    ``committed`` when finished; a rollback returns them to ``not_start``.
    """

    def __init__(
        self,
        store: "Optional[TaskStore]" = None,
        *,
        unique_payload=True,
        lease_timeout_seconds=None,
        payload_codec=None,
    ):
        """Create a pool on top of ``store``.

        Args:
            store: A store object, a ``str`` / ``os.PathLike`` path whose
                file suffix selects the store type, or ``None`` for the
                default ``JSONFileStore()``.
            unique_payload: Pool-wide default for :meth:`append`; when true,
                generated keys depend only on the encoded payload.
            lease_timeout_seconds: When not ``None``, every :meth:`acquire`
                first calls :meth:`reset_stale` with this age.
            payload_codec: Object providing ``encode`` / ``decode``; defaults
                to ``JsonPayloadCodec()``.

        Raises:
            ValueError: if ``lease_timeout_seconds`` is negative, or a path
                was given whose suffix is missing or unsupported.
        """
        if lease_timeout_seconds is not None and lease_timeout_seconds < 0:
            raise ValueError("lease_timeout_seconds must be None or >= 0")

        if store is None:
            self.store = JSONFileStore()
        elif isinstance(store, (str, os.PathLike)):
            self.store = self._store_from_path(store)
        else:
            self.store = store
        self.unique_payload = unique_payload
        self.lease_timeout_seconds = lease_timeout_seconds
        self.payload_codec = payload_codec or JsonPayloadCodec()

    @classmethod
    def json(cls, path="task_pool.json", **kwargs):
        """Build a pool backed by ``JSONFileStore(path)``."""
        return cls(JSONFileStore(path), **kwargs)

    @classmethod
    def sqlite(cls, path="task_pool.db", **kwargs):
        """Build a pool backed by ``SQLiteStore(path)``."""
        from task_pool.stores.sqlite import SQLiteStore

        return cls(SQLiteStore(path), **kwargs)

    @classmethod
    def csv(
        cls,
        path,
        *,
        has_header=True,
        state_path=None,
        encoding="utf-8",
        **kwargs,
    ):
        """Build a pool whose tasks are the rows of a CSV file.

        ``has_header``, ``state_path`` and ``encoding`` are forwarded to
        ``CSVRowStore``; remaining keyword arguments go to the pool itself.
        """
        from task_pool.stores.csv_file import CSVRowStore

        return cls(
            CSVRowStore(
                path,
                has_header=has_header,
                state_path=state_path,
                encoding=encoding,
            ),
            **kwargs,
        )

    @classmethod
    def tsv(
        cls,
        path,
        *,
        has_header=True,
        state_path=None,
        encoding="utf-8",
        **kwargs,
    ):
        """Build a pool whose tasks are the rows of a tab-separated file.

        Same as :meth:`csv` except that ``CSVRowStore`` is created with a
        tab delimiter.
        """
        from task_pool.stores.csv_file import CSVRowStore

        return cls(
            CSVRowStore(
                path,
                delimiter="\t",
                has_header=has_header,
                state_path=state_path,
                encoding=encoding,
            ),
            **kwargs,
        )

    @classmethod
    def text(
        cls,
        path,
        *,
        state_path=None,
        encoding="utf-8",
        skip_blank=True,
        **kwargs,
    ):
        """Build a pool whose tasks are the lines of a text file.

        ``state_path``, ``encoding`` and ``skip_blank`` are forwarded to
        ``TextLineStore``; remaining keyword arguments go to the pool itself.
        """
        from task_pool.stores.text_file import TextLineStore

        return cls(
            TextLineStore(
                path,
                state_path=state_path,
                encoding=encoding,
                skip_blank=skip_blank,
            ),
            **kwargs,
        )

    def append(self, payload, key=None, override=False, unique_payload=None):
        """Encode ``payload`` and add it to the store as a ``not_start`` task.

        Args:
            payload: Data the payload codec can encode.
            key: Explicit task key; derived from the payload when ``None``.
            override: Forwarded to the store's ``add``.
            unique_payload: Per-call override of the pool-wide setting.

        Returns:
            Whatever the store's ``add`` returns.
        """
        if unique_payload is None:
            unique_payload = self.unique_payload

        payload_data = self.payload_codec.encode(payload)
        if key is None:
            key = self._make_key(payload_data, unique_payload)

        now = self._now()
        record = TaskRecord(
            key=key,
            payload_data=payload_data,
            status=Status.NOT_START,
            created_at=now,
            updated_at=now,
            leased_at=None,
        )
        return self.store.add(record, override=override)

    def acquire(self):
        """Claim the next task from the store.

        Returns:
            A ``Task`` for the claimed record, or ``None`` when the store has
            nothing to claim.
        """
        if self.lease_timeout_seconds is not None:
            # Give expired leases back before looking for work.
            self.reset_stale(self.lease_timeout_seconds)
        record = self.store.claim_next(now=self._now())
        if record is None:
            return None
        return self._task_from_record(record)

    def lease(self):
        """Return a context manager that claims a task and settles it on exit."""
        return _LeaseContext(self)

    def alease(self):
        """Return the ``async with`` counterpart of :meth:`lease`."""
        return _AsyncLeaseContext(self)

    def __iter__(self):
        """Iterate over task payloads; see :meth:`iter_tasks` for commit rules."""
        for task in self.iter_tasks():
            yield task.payload

    def iter_tasks(self):
        """Yield leased tasks one at a time until none can be claimed.

        The generator is suspended inside the lease block while the consumer
        works, so a task is committed when the consumer asks for the next
        one, and rolled back if the generator is closed or an exception is
        raised into it at that point.
        """
        while True:
            with self.lease() as task:
                if task is None:
                    break
                yield task

    def commit(self, key):
        """Mark the task stored under ``key`` as done."""
        self.store.mark_done(key, now=self._now())

    def rollback(self, key):
        """Roll back the task stored under ``key``."""
        self.store.rollback(key, now=self._now())

    def reset(self):
        """Ask the store to reset its tasks; returns the store's result."""
        return self.store.reset(now=self._now())

    def reset_stale(self, age_seconds):
        """Ask the store to reset leases that are at least ``age_seconds`` old.

        The store receives ``cutoff = now - age_seconds``.

        Raises:
            ValueError: if ``age_seconds`` is negative.
        """
        if age_seconds < 0:
            raise ValueError("age_seconds must be >= 0")
        now = self._now()
        return self.store.reset_stale(cutoff=now - age_seconds, now=now)

    def count(self, status=None):
        """Count tasks, optionally restricted to ``status`` (enum, value or name)."""
        return self.store.count(self._status_value(status))

    def stats(self):
        """Return ``{"total": n, <status value>: n, ...}`` for every status.

        Each number comes from a separate store query, so the figures are
        not one atomic snapshot.
        """
        counts = {status.value: self.count(status) for status in Status}
        return {"total": self.count(), **counts}

    def get_all(self, status=None):
        """Return the store's records as ``Task`` objects, optionally filtered."""
        return [
            self._task_from_record(record)
            for record in self.store.list(self._status_value(status))
        ]

    def for_each(self, action: Callable[[Any], Any]):
        """Call ``action(payload)`` for each claimable task.

        A task is committed when ``action`` returns; if it raises, the task
        is rolled back and the exception propagates.
        """
        while True:
            with self.lease() as task:
                if task is None:
                    break
                action(task.payload)

    def map(self, action: Callable[[Any], Any]):
        """Like :meth:`for_each`, but collect and return the results as a list."""
        results = []
        while True:
            with self.lease() as task:
                if task is None:
                    break
                results.append(action(task.payload))
        return results

    def _task_from_record(self, record):
        """Decode a stored record into a public ``Task``."""
        return Task(
            key=record.key,
            payload=self.payload_codec.decode(record.payload_data),
            status=record.status,
            created_at=record.created_at,
            updated_at=record.updated_at,
            leased_at=record.leased_at,
        )

    @staticmethod
    def _make_key(payload_data, unique_payload):
        """Return the first 12 hex digits of a SHA-256 over the encoded payload.

        When ``unique_payload`` is false a random UUID is mixed into the hash
        input, so appending the same payload again yields a different key.
        """
        seed = payload_data
        if not unique_payload:
            seed = f"{payload_data}:{uuid.uuid4().hex}"
        return hashlib.sha256(seed.encode("utf-8")).hexdigest()[:12]

    @staticmethod
    def _store_from_path(path):
        """Create the store implied by the file suffix of ``path``.

        ``.json`` selects ``JSONFileStore``; ``.sqlite`` / ``.sqlite3`` /
        ``.db`` select ``SQLiteStore``; ``.csv`` / ``.tsv`` select
        ``CSVRowStore``; ``.txt`` selects ``TextLineStore``. Matching is
        case-insensitive.

        Raises:
            ValueError: if the file name has no suffix or an unsupported one.
        """
        # Look at the final path component only: splitting the whole path
        # would treat a dot in a directory name ("run.v1/tasks") as a suffix.
        filename = os.path.basename(os.fspath(path)).lower()
        _, dot, extension = filename.rpartition(".")
        if not dot:
            raise ValueError("store path must have a supported suffix")
        suffix = "." + extension

        if suffix == ".json":
            return JSONFileStore(path)
        if suffix in {".sqlite", ".sqlite3", ".db"}:
            from task_pool.stores.sqlite import SQLiteStore

            return SQLiteStore(path)
        if suffix == ".csv":
            from task_pool.stores.csv_file import CSVRowStore

            return CSVRowStore(path)
        if suffix == ".tsv":
            from task_pool.stores.csv_file import CSVRowStore

            return CSVRowStore(path, delimiter="\t")
        if suffix == ".txt":
            from task_pool.stores.text_file import TextLineStore

            return TextLineStore(path)
        raise ValueError(f"unsupported store suffix {suffix}")

    @staticmethod
    def _status_value(status):
        """Normalize ``status`` to a ``Status`` member (or ``None``).

        Accepts ``None``, a ``Status`` member, a status value such as
        ``"pending"``, or a member name in any letter case.

        Raises:
            ValueError: for anything else.
        """
        if status is None:
            return None
        if isinstance(status, Status):
            return status
        if isinstance(status, str):
            try:
                return Status(status)
            except ValueError:
                pass  # not a value; fall through and try it as a member name
            member = Status.__members__.get(status.upper())
            if member is not None:
                return member
        raise ValueError(f"invalid status {status}")

    @staticmethod
    def _now():
        """Return the current time as seconds since the epoch."""
        return time.time()
|
task_pool/models.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Status(str, Enum):
    """Lifecycle state of a task; every member is also a plain ``str``."""

    NOT_START = "not_start"
    PENDING = "pending"
    COMMITTED = "committed"

    def __str__(self):
        """Render as the raw value (``"pending"``) instead of ``Status.PENDING``."""
        raw = self.value
        return raw

    def to_dict(self):
        """Return the raw string value of this status."""
        raw = self.value
        return raw
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class TaskRecord:
    """Immutable stored form of a task, with the payload still encoded as text."""

    # Identifier under which the store keeps this task.
    key: str
    # Payload exactly as produced by the payload codec.
    payload_data: str
    # Current lifecycle state.
    status: Status
    # Timestamps as float seconds (the pool fills them from time.time()).
    created_at: float = 0
    updated_at: float = 0
    # Time the task was last claimed; None while it is not leased.
    leased_at: Optional[float] = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class Task:
    """Immutable public view of a task, carrying the decoded payload."""

    # Identifier under which the store keeps this task.
    key: str
    # Payload after decoding by the payload codec.
    payload: Any
    # Current lifecycle state.
    status: Status
    # Timestamps as float seconds.
    created_at: float = 0
    updated_at: float = 0
    # Time the task was last claimed; None while it is not leased.
    leased_at: Optional[float] = None

    def to_dict(self):
        """Return the task as a plain dict, with ``status`` as its string value."""
        status_text = self.status.value
        return dict(
            key=self.key,
            payload=self.payload,
            status=status_text,
            created_at=self.created_at,
            updated_at=self.updated_at,
            leased_at=self.leased_at,
        )
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from task_pool.stores.base import TaskStore
|
|
2
|
+
from task_pool.stores.csv_file import CSVRowStore
|
|
3
|
+
from task_pool.stores.json_file import JSONFileStore
|
|
4
|
+
from task_pool.stores.sqlite import SQLiteStore
|
|
5
|
+
from task_pool.stores.text_file import TextLineStore
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"TaskStore",
|
|
10
|
+
"CSVRowStore",
|
|
11
|
+
"JSONFileStore",
|
|
12
|
+
"SQLiteStore",
|
|
13
|
+
"TextLineStore",
|
|
14
|
+
]
|
task_pool/stores/base.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Optional, Protocol
|
|
2
|
+
|
|
3
|
+
from task_pool.models import Status, TaskRecord
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TaskStore(Protocol):
    """Structural interface that a ``TaskPool`` expects from its storage backend."""

    def add(self, record: TaskRecord, *, override: bool) -> str:
        """Store ``record`` and return a key.

        ``override`` tells the store whether an existing entry with the same
        key may be replaced.
        """

    def claim_next(self, *, now: float) -> Optional[TaskRecord]:
        """Claim one task for processing, or return ``None`` if none is available.

        ``now`` is the timestamp to record for the claim.
        """

    def mark_done(self, key: str, *, now: float) -> None:
        """Record the task under ``key`` as finished at time ``now``."""

    def rollback(self, key: str, *, now: float) -> None:
        """Return the task under ``key`` to the not-started state at time ``now``."""

    def reset(self, *, now: float) -> int:
        """Reset tasks to the not-started state and return how many were affected."""

    def reset_stale(self, *, cutoff: float, now: float) -> int:
        """Reset claimed tasks whose lease is not newer than ``cutoff``.

        Returns the number of tasks that were reset.
        """

    def get(self, key: str) -> TaskRecord:
        """Return the record stored under ``key``."""

    def count(self, status: Optional[Status] = None) -> int:
        """Count stored tasks, restricted to ``status`` when one is given."""

    def list(self, status: Optional[Status] = None) -> list[TaskRecord]:
        """Return stored records, restricted to ``status`` when one is given."""
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from task_pool.stores.source_backed import SourceBackedStore
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CSVRowStore(SourceBackedStore):
    """Source-backed store that turns each row of a delimited file into a payload.

    With a header row, payloads are dicts keyed by the header fields;
    without one, they are keyed ``col1``, ``col2``, ...
    """

    def __init__(
        self,
        path,
        *,
        delimiter=",",
        has_header=True,
        state_path=None,
        encoding="utf-8",
    ):
        """Set up the store and import the rows of ``path`` immediately.

        Args:
            path: Delimited source file to read.
            delimiter: Field separator handed to the ``csv`` module.
            has_header: Whether the first row names the columns.
            state_path: Forwarded to ``SourceBackedStore``.
            encoding: Text encoding used to open the source file.
        """
        super().__init__(path, state_path=state_path)
        self.delimiter = delimiter
        self.has_header = has_header
        self.encoding = encoding
        self._import_rows()

    def _import_rows(self):
        """Feed every parsed row to the base class importer."""
        self._import_payloads(self._read_rows())

    def _read_rows(self):
        """Yield ``(row_number, payload_dict)`` pairs from the source file.

        Row numbers start at 1. Blank lines never produce a payload in
        either mode.

        Raises:
            FileNotFoundError: on first iteration, if the source is missing.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(self.path)
        with open(self.path, "r", newline="", encoding=self.encoding) as f:
            if self.has_header:
                reader = csv.DictReader(f, delimiter=self.delimiter)
                if reader.fieldnames is None:
                    # Empty file: no header, hence no rows.
                    return
                for row_number, row in enumerate(reader, start=1):
                    yield row_number, dict(row)
            else:
                reader = csv.reader(f, delimiter=self.delimiter)
                for row_number, row in enumerate(reader, start=1):
                    if not row:
                        # csv.reader reports a blank line as []; DictReader
                        # skips those, so do the same here instead of
                        # creating an empty {} task. The counter still
                        # advances so other rows keep their numbers.
                        continue
                    yield row_number, self._unnamed_row_to_dict(row)

    @staticmethod
    def _unnamed_row_to_dict(row):
        """Map a headerless row to ``{"col1": ..., "col2": ..., ...}``."""
        return {
            f"col{index}": value
            for index, value in enumerate(row, start=1)
        }
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import fcntl
|
|
8
|
+
|
|
9
|
+
from task_pool.models import Status, TaskRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JSONFileStore:
|
|
13
|
+
def __init__(self, path="task_pool.json"):
|
|
14
|
+
self.path = str(path)
|
|
15
|
+
self.lock_path = f"{self.path}.lock"
|
|
16
|
+
if not os.path.exists(self.path):
|
|
17
|
+
self._write_data({})
|
|
18
|
+
|
|
19
|
+
def add(self, record: TaskRecord, *, override: bool) -> str:
|
|
20
|
+
with self._locked_data() as data:
|
|
21
|
+
if override or record.key not in data:
|
|
22
|
+
data[record.key] = self._record_to_dict(record)
|
|
23
|
+
return record.key
|
|
24
|
+
|
|
25
|
+
def claim_next(self, *, now: float) -> Optional[TaskRecord]:
|
|
26
|
+
with self._locked_data() as data:
|
|
27
|
+
for key in sorted(
|
|
28
|
+
data,
|
|
29
|
+
key=lambda item: (data[item]["created_at"], item),
|
|
30
|
+
):
|
|
31
|
+
item = data[key]
|
|
32
|
+
if item["status"] == Status.NOT_START.value:
|
|
33
|
+
item["status"] = Status.PENDING.value
|
|
34
|
+
item["leased_at"] = now
|
|
35
|
+
item["updated_at"] = now
|
|
36
|
+
return self._dict_to_record(key, item)
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
def mark_done(self, key: str, *, now: float) -> None:
|
|
40
|
+
with self._locked_data() as data:
|
|
41
|
+
item = self._get_item(data, key)
|
|
42
|
+
self._require_status(item, key, Status.PENDING)
|
|
43
|
+
item["status"] = Status.COMMITTED.value
|
|
44
|
+
item["updated_at"] = now
|
|
45
|
+
item["leased_at"] = None
|
|
46
|
+
|
|
47
|
+
def rollback(self, key: str, *, now: float) -> None:
|
|
48
|
+
with self._locked_data() as data:
|
|
49
|
+
item = self._get_item(data, key)
|
|
50
|
+
item["status"] = Status.NOT_START.value
|
|
51
|
+
item["updated_at"] = now
|
|
52
|
+
item["leased_at"] = None
|
|
53
|
+
|
|
54
|
+
def reset(self, *, now: float) -> int:
|
|
55
|
+
with self._locked_data() as data:
|
|
56
|
+
for item in data.values():
|
|
57
|
+
item["status"] = Status.NOT_START.value
|
|
58
|
+
item["updated_at"] = now
|
|
59
|
+
item["leased_at"] = None
|
|
60
|
+
return len(data)
|
|
61
|
+
|
|
62
|
+
def reset_stale(self, *, cutoff: float, now: float) -> int:
|
|
63
|
+
count = 0
|
|
64
|
+
with self._locked_data() as data:
|
|
65
|
+
for item in data.values():
|
|
66
|
+
if (
|
|
67
|
+
item["status"] == Status.PENDING.value
|
|
68
|
+
and item["leased_at"] is not None
|
|
69
|
+
and item["leased_at"] <= cutoff
|
|
70
|
+
):
|
|
71
|
+
item["status"] = Status.NOT_START.value
|
|
72
|
+
item["updated_at"] = now
|
|
73
|
+
item["leased_at"] = None
|
|
74
|
+
count += 1
|
|
75
|
+
return count
|
|
76
|
+
|
|
77
|
+
def get(self, key: str) -> TaskRecord:
|
|
78
|
+
with self._locked_data(write=False) as data:
|
|
79
|
+
return self._dict_to_record(key, self._get_item(data, key))
|
|
80
|
+
|
|
81
|
+
def count(self, status: Optional[Status] = None) -> int:
|
|
82
|
+
with self._locked_data(write=False) as data:
|
|
83
|
+
if status is None:
|
|
84
|
+
return len(data)
|
|
85
|
+
return sum(1 for item in data.values() if item["status"] == status.value)
|
|
86
|
+
|
|
87
|
+
def list(self, status: Optional[Status] = None) -> list[TaskRecord]:
    """Return stored tasks ordered by ``(created_at, key)``.

    With ``status=None`` every task is returned; otherwise only tasks
    whose stored status equals ``status.value``. The state file is read
    under the lock but not rewritten.
    """
    with self._locked_data(write=False) as tasks:
        ordered = sorted(
            tasks.items(),
            key=lambda pair: (pair[1]["created_at"], pair[0]),
        )
        return [
            self._dict_to_record(task_key, entry)
            for task_key, entry in ordered
            if status is None or entry["status"] == status.value
        ]
|
|
95
|
+
|
|
96
|
+
@contextmanager
def _locked_data(self, write=True):
    """Yield the decoded state dict while holding an exclusive file lock.

    The parent directory of ``self.path`` is created if needed, then
    ``self.lock_path`` is opened and locked with ``fcntl.lockf`` before
    the state is read via ``self._read_data()``.

    Args:
        write: when true, the (possibly mutated) dict is passed to
            ``self._write_data()`` after the ``with`` body completes.

    The write-back only happens when the body finishes without raising.
    If the body raises, the on-disk state is left as it was, so a failure
    in the middle of a multi-item update is never persisted and a write
    error cannot mask the original exception. The lock is released in
    all cases.
    """
    os.makedirs(os.path.dirname(os.path.abspath(self.path)), exist_ok=True)
    with open(self.lock_path, "a+") as lock:
        fcntl.lockf(lock, fcntl.LOCK_EX)
        try:
            data = self._read_data()
            yield data
            # Reached only on a clean exit from the ``with`` body; an
            # exception is thrown in at the ``yield`` and skips this.
            if write:
                self._write_data(data)
        finally:
            fcntl.lockf(lock, fcntl.LOCK_UN)
|
|
108
|
+
|
|
109
|
+
def _read_data(self):
    """Load the JSON state stored at ``self.path``.

    Returns:
        The decoded JSON document, or an empty dict when the file does
        not exist or contains only whitespace (for example a file that
        was pre-created empty).

    Raises:
        json.JSONDecodeError: if the file has content that is not valid
            JSON.
    """
    # Open directly instead of checking os.path.exists() first, so there
    # is no window between the check and the open.
    try:
        with open(self.path, "r", encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        return {}
    if not text.strip():
        return {}
    return json.loads(text)
|
|
114
|
+
|
|
115
|
+
def _write_data(self, data):
    """Atomically replace the file at ``self.path`` with ``data`` as JSON.

    The document is serialized into a temporary file created in the same
    directory, flushed and fsynced, and then moved over ``self.path``
    with ``os.replace``. The target is therefore either the old or the
    new document, never a partially written one.

    If anything goes wrong, including ``KeyboardInterrupt`` or
    ``SystemExit`` arriving mid-write, the temporary file is removed
    before the exception is re-raised.
    """
    directory = os.path.dirname(os.path.abspath(self.path))
    os.makedirs(directory, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(
        prefix=".task_pool.",
        suffix=".tmp",
        dir=directory,
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, sort_keys=True)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, self.path)
    except BaseException:
        # BaseException rather than Exception: an interrupted run must
        # not leave stray ".task_pool.*.tmp" files next to the state.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass
        raise
|
|
136
|
+
|
|
137
|
+
@staticmethod
def _record_to_dict(record):
    """Convert a task record into the dict layout stored in the JSON file.

    The raw ``payload_data`` string is kept as-is and its decoded form is
    stored alongside it under ``payload``. ``status`` is stored as the
    enum member's ``value``.
    """
    raw_payload = record.payload_data
    return dict(
        payload_data=raw_payload,
        payload=json.loads(raw_payload),
        status=record.status.value,
        created_at=record.created_at,
        updated_at=record.updated_at,
        leased_at=record.leased_at,
    )
|
|
147
|
+
|
|
148
|
+
@staticmethod
def _dict_to_record(key, item):
    """Build a ``TaskRecord`` for ``key`` from its stored dict.

    When the entry has no ``payload_data`` (missing or ``None``), it is
    rebuilt by serializing ``item["payload"]`` as compact JSON with
    sorted keys. Missing ``created_at`` / ``updated_at`` default to ``0``
    and a missing ``leased_at`` to ``None``.
    """
    raw_payload = item.get("payload_data")
    if raw_payload is None:
        raw_payload = json.dumps(
            item["payload"],
            sort_keys=True,
            separators=(",", ":"),
            ensure_ascii=False,
        )
    fields = {
        "key": key,
        "payload_data": raw_payload,
        "status": Status(item["status"]),
        "created_at": item.get("created_at", 0),
        "updated_at": item.get("updated_at", 0),
        "leased_at": item.get("leased_at"),
    }
    return TaskRecord(**fields)
|
|
166
|
+
|
|
167
|
+
@staticmethod
def _get_item(data, key):
    """Return ``data[key]``, raising a bare ``KeyError(key)`` when absent."""
    if key in data:
        return data[key]
    # "from None" keeps the traceback free of any chained context.
    raise KeyError(key) from None
|
|
173
|
+
|
|
174
|
+
@staticmethod
def _require_status(item, key, status):
    """Raise ``ValueError`` unless ``item`` is stored with ``status``.

    ``item["status"]`` is compared against ``status.value``; ``key`` is
    only used in the error message.
    """
    if item["status"] == status.value:
        return
    raise ValueError(
        f"task {key} is {item['status']}, expected {status.value}"
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
from task_pool.models import Status, TaskRecord
|
|
6
|
+
from task_pool.stores.json_file import JSONFileStore
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SourceBackedStore:
    """Store that tracks task state in a JSON file kept beside a source file.

    All store operations are forwarded to a ``JSONFileStore`` opened on
    ``state_path``. ``_import_payloads`` lets subclasses register
    payloads read from the source file.
    """

    def __init__(self, path, *, state_path=None):
        """Remember the source ``path`` and open the JSON state store.

        When ``state_path`` is empty or ``None`` the state file defaults
        to ``"<path>.task_pool.json"``.
        """
        self.path = str(path)
        if state_path:
            self.state_path = state_path
        else:
            self.state_path = f"{self.path}.task_pool.json"
        self.state_store = JSONFileStore(self.state_path)

    def add(self, record: TaskRecord, *, override: bool) -> str:
        """Forward to ``state_store.add`` and return its result."""
        store = self.state_store
        return store.add(record, override=override)

    def claim_next(self, *, now: float):
        """Forward to ``state_store.claim_next`` and return its result."""
        store = self.state_store
        return store.claim_next(now=now)

    def mark_done(self, key: str, *, now: float) -> None:
        """Forward to ``state_store.mark_done``."""
        store = self.state_store
        store.mark_done(key, now=now)

    def rollback(self, key: str, *, now: float) -> None:
        """Forward to ``state_store.rollback``."""
        store = self.state_store
        store.rollback(key, now=now)

    def reset(self, *, now: float) -> int:
        """Forward to ``state_store.reset`` and return its result."""
        store = self.state_store
        return store.reset(now=now)

    def reset_stale(self, *, cutoff: float, now: float) -> int:
        """Forward to ``state_store.reset_stale`` and return its result."""
        store = self.state_store
        return store.reset_stale(cutoff=cutoff, now=now)

    def get(self, key: str) -> TaskRecord:
        """Forward to ``state_store.get`` and return its result."""
        store = self.state_store
        return store.get(key)

    def count(self, status=None) -> int:
        """Forward to ``state_store.count`` and return its result."""
        store = self.state_store
        return store.count(status)

    def list(self, status=None) -> list[TaskRecord]:
        """Forward to ``state_store.list`` and return its result."""
        store = self.state_store
        return store.list(status)

    def _import_payloads(self, payloads):
        """Register every ``(row_number, payload)`` pair as a new task.

        Each payload is serialized as compact JSON with sorted keys and
        added with ``override=False`` in the ``NOT_START`` state. All
        rows share one import timestamp; ``created_at`` is offset by
        ``row_number`` microseconds so rows sort in source order.
        """
        imported_at = time.time()
        for row_number, payload in payloads:
            encoded = json.dumps(
                payload,
                sort_keys=True,
                separators=(",", ":"),
                ensure_ascii=False,
            )
            self.state_store.add(
                TaskRecord(
                    key=self._make_source_key(row_number, encoded),
                    payload_data=encoded,
                    status=Status.NOT_START,
                    created_at=imported_at + row_number / 1_000_000,
                    updated_at=imported_at,
                    leased_at=None,
                ),
                override=False,
            )

    @staticmethod
    def _make_source_key(row_number, payload_data):
        """Return the first 12 hex digits of SHA-256 over ``"<row>:<payload>"``."""
        digest = hashlib.sha256(f"{row_number}:{payload_data}".encode("utf-8"))
        return digest.hexdigest()[:12]
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from task_pool.models import Status, TaskRecord
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SQLiteStore:
    """Task store that keeps each task as a row of a SQLite ``tasks`` table.

    Every operation opens its own connection and closes it when done.
    Mutating operations run inside a ``BEGIN IMMEDIATE`` transaction that
    is committed only if the operation completes without raising.
    """

    def __init__(self, path="task_pool.db"):
        """Remember the database ``path`` and make sure the schema exists."""
        self.path = str(path)
        self._init_db()

    def add(self, record: TaskRecord, *, override: bool) -> str:
        """Insert ``record`` and return its key.

        With ``override`` true an existing row with the same key has its
        payload, status and ``updated_at`` replaced and its lease
        cleared. Otherwise an existing row is left untouched.
        """
        if override:
            statement = """
                INSERT INTO tasks (
                    key, payload_data, status, created_at,
                    updated_at, leased_at
                )
                VALUES (?, ?, ?, ?, ?, NULL)
                ON CONFLICT(key) DO UPDATE SET
                    payload_data = excluded.payload_data,
                    status = excluded.status,
                    updated_at = excluded.updated_at,
                    leased_at = NULL
                """
        else:
            statement = """
                INSERT OR IGNORE INTO tasks (
                    key, payload_data, status, created_at,
                    updated_at, leased_at
                )
                VALUES (?, ?, ?, ?, ?, NULL)
                """
        with self._transaction() as conn:
            conn.execute(
                statement,
                (
                    record.key,
                    record.payload_data,
                    record.status.value,
                    record.created_at,
                    record.updated_at,
                ),
            )
        return record.key

    def claim_next(self, *, now: float) -> Optional[TaskRecord]:
        """Lease the oldest ``NOT_START`` task, or return ``None`` if none.

        The chosen task (lowest ``created_at``, then key) is switched to
        ``PENDING`` with ``leased_at`` and ``updated_at`` set to ``now``,
        and the updated row is returned as a ``TaskRecord``.
        """
        with self._transaction() as conn:
            candidate = conn.execute(
                """
                SELECT * FROM tasks
                WHERE status = ?
                ORDER BY created_at, key
                LIMIT 1
                """,
                (Status.NOT_START.value,),
            ).fetchone()
            if candidate is None:
                return None

            task_key = candidate["key"]
            conn.execute(
                """
                UPDATE tasks
                SET status = ?, leased_at = ?, updated_at = ?
                WHERE key = ?
                """,
                (Status.PENDING.value, now, now, task_key),
            )
            claimed = conn.execute(
                "SELECT * FROM tasks WHERE key = ?",
                (task_key,),
            ).fetchone()
        return self._row_to_record(claimed)

    def mark_done(self, key: str, *, now: float) -> None:
        """Mark the ``PENDING`` task ``key`` as ``COMMITTED``.

        Raises:
            KeyError: if no task is stored under ``key``.
            ValueError: if the task is not ``PENDING``.
        """
        with self._transaction() as conn:
            current = self._ensure_exists(conn, key)
            self._require_status(current, key, Status.PENDING)
            conn.execute(
                """
                UPDATE tasks
                SET status = ?, updated_at = ?, leased_at = NULL
                WHERE key = ?
                """,
                (Status.COMMITTED.value, now, key),
            )

    def rollback(self, key: str, *, now: float) -> None:
        """Return task ``key`` to ``NOT_START`` regardless of its status.

        Raises:
            KeyError: if no task is stored under ``key``.
        """
        with self._transaction() as conn:
            self._ensure_exists(conn, key)
            conn.execute(
                """
                UPDATE tasks
                SET status = ?, updated_at = ?, leased_at = NULL
                WHERE key = ?
                """,
                (Status.NOT_START.value, now, key),
            )

    def reset(self, *, now: float) -> int:
        """Return every task to ``NOT_START``; report the rows updated."""
        with self._transaction() as conn:
            cursor = conn.execute(
                """
                UPDATE tasks
                SET status = ?, updated_at = ?, leased_at = NULL
                """,
                (Status.NOT_START.value, now),
            )
            changed = cursor.rowcount
        return changed

    def reset_stale(self, *, cutoff: float, now: float) -> int:
        """Release ``PENDING`` tasks leased at or before ``cutoff``.

        Returns:
            The number of rows that were returned to ``NOT_START``.
        """
        with self._transaction() as conn:
            cursor = conn.execute(
                """
                UPDATE tasks
                SET status = ?, updated_at = ?, leased_at = NULL
                WHERE status = ? AND leased_at IS NOT NULL AND leased_at <= ?
                """,
                (
                    Status.NOT_START.value,
                    now,
                    Status.PENDING.value,
                    cutoff,
                ),
            )
            changed = cursor.rowcount
        return changed

    def get(self, key: str) -> TaskRecord:
        """Return the task stored under ``key``; ``KeyError`` if absent."""
        with self._connection() as conn:
            found = self._ensure_exists(conn, key)
            return self._row_to_record(found)

    def count(self, status: Optional[Status] = None) -> int:
        """Count all tasks, or only those whose status equals ``status``."""
        with self._connection() as conn:
            if status is not None:
                cursor = conn.execute(
                    "SELECT COUNT(*) AS count FROM tasks WHERE status = ?",
                    (status.value,),
                )
            else:
                cursor = conn.execute("SELECT COUNT(*) AS count FROM tasks")
            return cursor.fetchone()["count"]

    def list(self, status: Optional[Status] = None) -> list[TaskRecord]:
        """Return tasks ordered by ``(created_at, key)``.

        With ``status=None`` every task is returned; otherwise only the
        tasks in that status.
        """
        with self._connection() as conn:
            if status is not None:
                cursor = conn.execute(
                    """
                    SELECT * FROM tasks
                    WHERE status = ?
                    ORDER BY created_at, key
                    """,
                    (status.value,),
                )
            else:
                cursor = conn.execute(
                    "SELECT * FROM tasks ORDER BY created_at, key"
                )
            return [self._row_to_record(row) for row in cursor.fetchall()]

    def _init_db(self):
        """Switch the database to WAL and create the table and index."""
        schema_statements = (
            """
            CREATE TABLE IF NOT EXISTS tasks (
                key TEXT PRIMARY KEY,
                payload_data TEXT NOT NULL,
                status TEXT NOT NULL,
                created_at REAL NOT NULL,
                updated_at REAL NOT NULL,
                leased_at REAL
            )
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_tasks_claim
            ON tasks(status, created_at, key)
            """,
        )
        with self._connection() as conn:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA busy_timeout=30000")
            for statement in schema_statements:
                conn.execute(statement)
            conn.commit()

    @contextmanager
    def _connection(self):
        """Yield a fresh autocommit connection and always close it.

        Rows come back as ``sqlite3.Row`` and the busy timeout is set to
        30 seconds.
        """
        conn = sqlite3.connect(
            self.path,
            timeout=30,
            isolation_level=None,
        )
        conn.row_factory = sqlite3.Row
        try:
            conn.execute("PRAGMA busy_timeout=30000")
            yield conn
        finally:
            conn.close()

    @contextmanager
    def _transaction(self):
        """Yield a connection inside ``BEGIN IMMEDIATE``; commit on success.

        If the ``with`` body raises, no commit is issued and the
        connection is simply closed.
        """
        with self._connection() as conn:
            conn.execute("BEGIN IMMEDIATE")
            yield conn
            conn.commit()

    @staticmethod
    def _row_to_record(row):
        """Convert a ``tasks`` row into a ``TaskRecord``."""
        fields = {
            "key": row["key"],
            "payload_data": row["payload_data"],
            "status": Status(row["status"]),
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
            "leased_at": row["leased_at"],
        }
        return TaskRecord(**fields)

    @staticmethod
    def _ensure_exists(conn, key):
        """Fetch the row for ``key`` or raise ``KeyError(key)``."""
        cursor = conn.execute(
            "SELECT * FROM tasks WHERE key = ?",
            (key,),
        )
        found = cursor.fetchone()
        if found is not None:
            return found
        raise KeyError(key)

    @staticmethod
    def _require_status(row, key, status):
        """Raise ``ValueError`` unless ``row`` is stored with ``status``."""
        if row["status"] == status.value:
            return
        raise ValueError(
            f"task {key} is {row['status']}, expected {status.value}"
        )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from task_pool.stores.source_backed import SourceBackedStore
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TextLineStore(SourceBackedStore):
    """Source-backed store that turns each line of a text file into a task."""

    def __init__(
        self,
        path,
        *,
        state_path=None,
        encoding="utf-8",
        skip_blank=True,
    ):
        """Open the state store and import the lines of ``path``.

        Args:
            path: text file to read tasks from.
            state_path: passed through to ``SourceBackedStore``.
            encoding: encoding used to read ``path``.
            skip_blank: when true, lines that are empty once the line
                ending is removed are not imported.
        """
        super().__init__(path, state_path=state_path)
        self.encoding = encoding
        self.skip_blank = skip_blank
        self._import_lines()

    def _import_lines(self):
        """Feed every line produced by ``_read_lines`` to the importer."""
        numbered_payloads = self._read_lines()
        self._import_payloads(numbered_payloads)

    def _read_lines(self):
        """Yield ``(line_number, payload)`` for each line of the source file.

        Line numbers start at 1 and count skipped lines too. The payload
        carries the line text without trailing ``\\r`` / ``\\n``
        characters, plus its line number.

        Raises:
            FileNotFoundError: on first iteration, if the source file
                does not exist.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(self.path)
        with open(self.path, "r", encoding=self.encoding) as source:
            for number, raw in enumerate(source, start=1):
                text = raw.rstrip("\r\n")
                if text or not self.skip_blank:
                    yield number, {
                        "line": text,
                        "line_number": number,
                    }
|