safe-state 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- safe_state-0.1.0/LICENSE +21 -0
- safe_state-0.1.0/PKG-INFO +366 -0
- safe_state-0.1.0/README.md +310 -0
- safe_state-0.1.0/pyproject.toml +66 -0
- safe_state-0.1.0/safe_state/__init__.py +64 -0
- safe_state-0.1.0/safe_state/checkpoint.py +168 -0
- safe_state-0.1.0/safe_state/core.py +358 -0
- safe_state-0.1.0/safe_state/exceptions.py +21 -0
- safe_state-0.1.0/safe_state/reconnect.py +267 -0
- safe_state-0.1.0/safe_state/serialization.py +112 -0
- safe_state-0.1.0/safe_state.egg-info/PKG-INFO +366 -0
- safe_state-0.1.0/safe_state.egg-info/SOURCES.txt +17 -0
- safe_state-0.1.0/safe_state.egg-info/dependency_links.txt +1 -0
- safe_state-0.1.0/safe_state.egg-info/requires.txt +6 -0
- safe_state-0.1.0/safe_state.egg-info/top_level.txt +1 -0
- safe_state-0.1.0/setup.cfg +4 -0
- safe_state-0.1.0/tests/test_checkpoint.py +148 -0
- safe_state-0.1.0/tests/test_core.py +233 -0
- safe_state-0.1.0/tests/test_reconnect.py +120 -0
safe_state-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishant Bhatte (IronFighter23)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: safe-state
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Resumable execution for Python. One decorator. Zero retry loops.
|
|
5
|
+
Author-email: Nishant Bhatte <ironfighter23@users.noreply.github.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Nishant Bhatte (IronFighter23)
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/IronFighter23/safe-state
|
|
29
|
+
Project-URL: Repository, https://github.com/IronFighter23/safe-state
|
|
30
|
+
Project-URL: Issues, https://github.com/IronFighter23/safe-state/issues
|
|
31
|
+
Project-URL: Changelog, https://github.com/IronFighter23/safe-state/blob/main/CHANGELOG.md
|
|
32
|
+
Keywords: checkpoint,resume,fault-tolerance,retry,serialization,dill,automation,batch-processing,decorator
|
|
33
|
+
Classifier: Development Status :: 4 - Beta
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: System Administrators
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
44
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
45
|
+
Classifier: Topic :: System :: Recovery Tools
|
|
46
|
+
Classifier: Topic :: Utilities
|
|
47
|
+
Requires-Python: >=3.9
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
License-File: LICENSE
|
|
50
|
+
Requires-Dist: dill>=0.3.7
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
53
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
54
|
+
Requires-Dist: requests>=2.28; extra == "dev"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
# safe-state
|
|
58
|
+
|
|
59
|
+
**Resumable execution for Python. One decorator. Zero retry loops.**
|
|
60
|
+
|
|
61
|
+
[PyPI version](https://pypi.org/project/safe-state/)
|
|
62
|
+
[Supported Python versions](https://pypi.org/project/safe-state/)
|
|
63
|
+
[License: MIT](LICENSE)
|
|
64
|
+
|
|
65
|
+
You wrote a Python script that loops through 10,000 things — sending welcome
|
|
66
|
+
emails, downloading files, calling an API for each user in your database,
|
|
67
|
+
resizing images, scraping URLs. Somewhere around item 6,432 the network blips,
|
|
68
|
+
a rate-limit kicks in, or someone unplugs your laptop. Everything dies. You
|
|
69
|
+
have no idea what was done and what wasn't.
|
|
70
|
+
|
|
71
|
+
The usual fix is a thicket of `try/except` blocks, manual retry loops, a "last
|
|
72
|
+
processed ID" column in some side database, and a `--resume-from` CLI flag.
|
|
73
|
+
`safe-state` deletes all of that:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from safe_state import safe_state
|
|
77
|
+
|
|
78
|
+
@safe_state
|
|
79
|
+
def send_welcome_emails(users, mailer):
|
|
80
|
+
for user in users:
|
|
81
|
+
mailer.send(user.email, "Welcome!", render_template(user))
|
|
82
|
+
|
|
83
|
+
send_welcome_emails(load_users(), open_mailer())
|
|
84
|
+
# Crashes at user 6,432? Just run the script again. It skips the first 6,431
|
|
85
|
+
# and picks up at 6,432. No code changes needed.
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## What makes this hard (and why most checkpointing tools don't actually work)
|
|
91
|
+
|
|
92
|
+
Python's built-in `pickle` can serialize dictionaries, lists, integers, and most
|
|
93
|
+
plain objects. It **cannot** serialize:
|
|
94
|
+
|
|
95
|
+
- Open network sockets
|
|
96
|
+
- Live database connections (`sqlite3`, `psycopg2`, `pymongo`)
|
|
97
|
+
- Open file handles
|
|
98
|
+
- `requests.Session` objects with active TCP keep-alives
|
|
99
|
+
- Any object holding a C-level resource
|
|
100
|
+
|
|
101
|
+
So a naive "just pickle everything" checkpointer crashes the moment your script
|
|
102
|
+
holds anything useful. `safe-state` solves this with a **reconnect registry**:
|
|
103
|
+
when it finds a live object, it serializes a small metadata record describing
|
|
104
|
+
*how to recreate the object*, then rebuilds a fresh one on resume.
|
|
105
|
+
|
|
106
|
+
Built-in handlers ship for `sqlite3.Connection`, `socket.socket`,
|
|
107
|
+
`requests.Session`, and file handles. Custom types are a five-line
|
|
108
|
+
`register_reconnector()` call away.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Install
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
pip install safe-state
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Requires Python 3.9+ and `dill` (the only runtime dependency; `pickle` isn't
|
|
119
|
+
powerful enough on its own).
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## How it works
|
|
124
|
+
|
|
125
|
+
`@safe_state` does three things to the function it wraps:
|
|
126
|
+
|
|
127
|
+
1. **Intercepts the iterable argument** (the first positional argument by default; set `iterable_arg` to choose another). The function still sees a normal
|
|
128
|
+
iterable, but `safe-state` is silently tracking which items have completed.
|
|
129
|
+
2. **Persists progress after every item** (or every N items — configurable) to
|
|
130
|
+
a `.safestate` file on disk via an atomic write.
|
|
131
|
+
3. **Captures locals on failure.** When an exception escapes the function,
|
|
132
|
+
`safe-state` walks the traceback, grabs the local variables from the failing
|
|
133
|
+
frame, freezes them with `dill` plus the reconnect registry, and writes them
|
|
134
|
+
to the checkpoint. The exception then re-raises as normal — `safe-state`
|
|
135
|
+
never silently swallows errors.
|
|
136
|
+
|
|
137
|
+
On the next invocation with the same job name, the checkpoint is loaded,
|
|
138
|
+
already-completed indices are skipped, and the iteration resumes from where it
|
|
139
|
+
stopped.
|
|
140
|
+
|
|
141
|
+
On successful completion, the checkpoint file is deleted.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Full example: downloading 500 images
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
import requests
|
|
149
|
+
from safe_state import safe_state
|
|
150
|
+
|
|
151
|
+
@safe_state(name="image-scrape", verbose=True)
|
|
152
|
+
def download_all(urls, session):
|
|
153
|
+
for url in urls:
|
|
154
|
+
filename = url.split("/")[-1]
|
|
155
|
+
response = session.get(url, timeout=10)
|
|
156
|
+
response.raise_for_status()
|
|
157
|
+
with open(f"downloads/{filename}", "wb") as f:
|
|
158
|
+
f.write(response.content)
|
|
159
|
+
|
|
160
|
+
if __name__ == "__main__":
|
|
161
|
+
urls = open("urls.txt").read().splitlines()
|
|
162
|
+
download_all(urls, requests.Session())
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
**Run 1** — connection times out on file 234 (zero-based index 233 in the log below):
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
[safe_state] starting fresh job 'image-scrape'
|
|
169
|
+
[safe_state] 'image-scrape' failed at item 233:
|
|
170
|
+
ConnectionError: HTTPSConnectionPool... Read timed out.
|
|
171
|
+
Progress 233/500 saved to .safe_state/image-scrape.safestate
|
|
172
|
+
Traceback (most recent call last): ...
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Run 2** — same command, no flags, no edits:
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
[safe_state] resuming 'image-scrape': 233/500 done (run #2)
|
|
179
|
+
[safe_state] skip index 0 (done)
|
|
180
|
+
...
|
|
181
|
+
[safe_state] skip index 232 (done)
|
|
182
|
+
# resumes at item 233, completes through 499
|
|
183
|
+
[✓] Job complete. Checkpoint cleared.
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## More use cases
|
|
189
|
+
|
|
190
|
+
Anything that loops through a batch of work benefits from this:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
# Bulk database backfill
|
|
194
|
+
@safe_state(name="backfill-2026")
|
|
195
|
+
def backfill(user_ids, conn):
|
|
196
|
+
for uid in user_ids:
|
|
197
|
+
new_value = expensive_computation(uid)
|
|
198
|
+
conn.execute("UPDATE users SET score = ? WHERE id = ?", (new_value, uid))
|
|
199
|
+
conn.commit()
|
|
200
|
+
|
|
201
|
+
# Processing a giant CSV
|
|
202
|
+
@safe_state(name="csv-cleanup")
|
|
203
|
+
def clean_rows(rows, output_writer):
|
|
204
|
+
for row in rows:
|
|
205
|
+
cleaned = normalize(row)
|
|
206
|
+
output_writer.writerow(cleaned)
|
|
207
|
+
|
|
208
|
+
# Calling an API for every record
|
|
209
|
+
@safe_state(name="enrich-leads", save_every=10)
|
|
210
|
+
def enrich(leads, api_client):
|
|
211
|
+
for lead in leads:
|
|
212
|
+
data = api_client.lookup(lead.email)
|
|
213
|
+
lead.enriched_data = data
|
|
214
|
+
lead.save()
|
|
215
|
+
|
|
216
|
+
# Resizing thousands of images
|
|
217
|
+
@safe_state(name="thumbnails")
|
|
218
|
+
def make_thumbs(image_paths):
|
|
219
|
+
for path in image_paths:
|
|
220
|
+
img = Image.open(path)
|
|
221
|
+
img.thumbnail((256, 256))
|
|
222
|
+
img.save(path.replace(".jpg", "_thumb.jpg"))
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
In every case, if the script crashes partway, you just rerun it. No retry
|
|
226
|
+
logic, no progress columns, no resume flags.
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## API
|
|
231
|
+
|
|
232
|
+
### `@safe_state`
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
@safe_state(
|
|
236
|
+
name=None, # job identifier; defaults to fn.__qualname__
|
|
237
|
+
state_dir=".safe_state", # checkpoint directory
|
|
238
|
+
iterable_arg=0, # which arg is the iterable (int index or kwarg name)
|
|
239
|
+
save_every=1, # persist every N completed items
|
|
240
|
+
store_results=False, # also store each item's value (must be serializable)
|
|
241
|
+
keep_on_success=False, # keep checkpoint after successful completion
|
|
242
|
+
verbose=False, # print progress to stderr
|
|
243
|
+
auto_iterate=True, # set False for manual checkpoint() mode
|
|
244
|
+
)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
The decorator works with or without parentheses:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
@safe_state # equivalent to @safe_state()
|
|
251
|
+
def f(items): ...
|
|
252
|
+
|
|
253
|
+
@safe_state(name="custom")
|
|
254
|
+
def g(items): ...
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Inspecting checkpoints
|
|
258
|
+
|
|
259
|
+
Every decorated function exposes three helpers:
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
@safe_state
|
|
263
|
+
def my_job(items): ...
|
|
264
|
+
|
|
265
|
+
my_job.peek_checkpoint() # -> Checkpoint object, or None
|
|
266
|
+
my_job.clear_checkpoint() # -> deletes the .safestate file
|
|
267
|
+
my_job.checkpoint_path # -> Path to the .safestate file
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
A `Checkpoint` object holds:
|
|
271
|
+
|
|
272
|
+
- `completed_indices: set[int]`
|
|
273
|
+
- `total_items: int | None`
|
|
274
|
+
- `last_failure: dict | None` — exception type, message, traceback, index
|
|
275
|
+
- `frozen_state: bytes | None` — `dill`-serialized locals from the failing frame
|
|
276
|
+
- `run_count: int`
|
|
277
|
+
- `progress() -> dict` — human-readable summary
|
|
278
|
+
|
|
279
|
+
### Reconnect registry
|
|
280
|
+
|
|
281
|
+
Built-in handlers cover `sqlite3.Connection`, `socket.socket`,
|
|
282
|
+
`requests.Session`, and `io.IOBase` (file handles). To add your own:
|
|
283
|
+
|
|
284
|
+
```python
|
|
285
|
+
from safe_state import register_reconnector
|
|
286
|
+
|
|
287
|
+
class MyApiClient:
|
|
288
|
+
def __init__(self, host, token):
|
|
289
|
+
self.host = host
|
|
290
|
+
self.token = token
|
|
291
|
+
self.session = open_some_session(host, token)
|
|
292
|
+
|
|
293
|
+
register_reconnector(
|
|
294
|
+
MyApiClient,
|
|
295
|
+
extract=lambda c: {"host": c.host, "token": c.token},
|
|
296
|
+
reconnect=lambda meta: MyApiClient(meta["host"], meta["token"]),
|
|
297
|
+
)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
That's it — any `MyApiClient` instance held in your function's locals will now
|
|
301
|
+
survive checkpoint/restore.
|
|
302
|
+
|
|
303
|
+
### Manual checkpointing (advanced)
|
|
304
|
+
|
|
305
|
+
If your function doesn't fit the "loop over items" mould — e.g. it processes a
|
|
306
|
+
graph or a single very long task — set `auto_iterate=False` and call
|
|
307
|
+
`checkpoint()` manually:
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
from safe_state import safe_state, checkpoint
|
|
311
|
+
|
|
312
|
+
@safe_state(auto_iterate=False)
|
|
313
|
+
def big_job(graph):
|
|
314
|
+
visited = set()
|
|
315
|
+
for node in graph.walk():
|
|
316
|
+
process(node)
|
|
317
|
+
visited.add(node.id)
|
|
318
|
+
checkpoint(visited=visited) # freeze progress here
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
---
|
|
322
|
+
|
|
323
|
+
## What `safe-state` is **not**
|
|
324
|
+
|
|
325
|
+
- **Not a distributed task queue.** For multi-machine job dispatch use Celery,
|
|
326
|
+
Dramatiq, or RQ. `safe-state` solves the much smaller problem of "this one
|
|
327
|
+
process crashed; let me rerun the same script and resume."
|
|
328
|
+
- **Not a transaction manager.** If your work involves multi-step database
|
|
329
|
+
state that needs rollback, use real transactions. `safe-state` checkpoints at
|
|
330
|
+
iteration boundaries; an item is either complete or it isn't.
|
|
331
|
+
- **Not magic.** It doesn't freeze CPython frames mid-instruction. The
|
|
332
|
+
iteration boundary is the resume granularity. If a single item's work is
|
|
333
|
+
itself a long pipeline, decompose it into smaller items.
|
|
334
|
+
|
|
335
|
+
---
|
|
336
|
+
|
|
337
|
+
## Performance
|
|
338
|
+
|
|
339
|
+
The default `save_every=1` writes a checkpoint after every iteration. For most
|
|
340
|
+
real workloads (network calls, DB writes) this is well under a millisecond of
|
|
341
|
+
overhead and totally invisible. If your inner loop is microsecond-scale, raise
|
|
342
|
+
`save_every` to batch progress flushes:
|
|
343
|
+
|
|
344
|
+
```python
|
|
345
|
+
@safe_state(save_every=100)
|
|
346
|
+
def fast_loop(items):
|
|
347
|
+
for item in items:
|
|
348
|
+
cheap_in_memory_work(item)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## License
|
|
354
|
+
|
|
355
|
+
MIT. See [LICENSE](LICENSE).
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## Contributing
|
|
360
|
+
|
|
361
|
+
Issues and pull requests welcome. Run the test suite with:
|
|
362
|
+
|
|
363
|
+
```bash
|
|
364
|
+
pip install -e ".[dev]"
|
|
365
|
+
pytest
|
|
366
|
+
```
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
# safe-state
|
|
2
|
+
|
|
3
|
+
**Resumable execution for Python. One decorator. Zero retry loops.**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/safe-state/)
|
|
6
|
+
[Supported Python versions](https://pypi.org/project/safe-state/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
You wrote a Python script that loops through 10,000 things — sending welcome
|
|
10
|
+
emails, downloading files, calling an API for each user in your database,
|
|
11
|
+
resizing images, scraping URLs. Somewhere around item 6,432 the network blips,
|
|
12
|
+
a rate-limit kicks in, or someone unplugs your laptop. Everything dies. You
|
|
13
|
+
have no idea what was done and what wasn't.
|
|
14
|
+
|
|
15
|
+
The usual fix is a thicket of `try/except` blocks, manual retry loops, a "last
|
|
16
|
+
processed ID" column in some side database, and a `--resume-from` CLI flag.
|
|
17
|
+
`safe-state` deletes all of that:
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from safe_state import safe_state
|
|
21
|
+
|
|
22
|
+
@safe_state
|
|
23
|
+
def send_welcome_emails(users, mailer):
|
|
24
|
+
for user in users:
|
|
25
|
+
mailer.send(user.email, "Welcome!", render_template(user))
|
|
26
|
+
|
|
27
|
+
send_welcome_emails(load_users(), open_mailer())
|
|
28
|
+
# Crashes at user 6,432? Just run the script again. It skips the first 6,431
|
|
29
|
+
# and picks up at 6,432. No code changes needed.
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## What makes this hard (and why most checkpointing tools don't actually work)
|
|
35
|
+
|
|
36
|
+
Python's built-in `pickle` can serialize dictionaries, lists, integers, and most
|
|
37
|
+
plain objects. It **cannot** serialize:
|
|
38
|
+
|
|
39
|
+
- Open network sockets
|
|
40
|
+
- Live database connections (`sqlite3`, `psycopg2`, `pymongo`)
|
|
41
|
+
- Open file handles
|
|
42
|
+
- `requests.Session` objects with active TCP keep-alives
|
|
43
|
+
- Any object holding a C-level resource
|
|
44
|
+
|
|
45
|
+
So a naive "just pickle everything" checkpointer crashes the moment your script
|
|
46
|
+
holds anything useful. `safe-state` solves this with a **reconnect registry**:
|
|
47
|
+
when it finds a live object, it serializes a small metadata record describing
|
|
48
|
+
*how to recreate the object*, then rebuilds a fresh one on resume.
|
|
49
|
+
|
|
50
|
+
Built-in handlers ship for `sqlite3.Connection`, `socket.socket`,
|
|
51
|
+
`requests.Session`, and file handles. Custom types are a five-line
|
|
52
|
+
`register_reconnector()` call away.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install safe-state
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Requires Python 3.9+ and `dill` (the only runtime dependency; `pickle` isn't
|
|
63
|
+
powerful enough on its own).
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## How it works
|
|
68
|
+
|
|
69
|
+
`@safe_state` does three things to the function it wraps:
|
|
70
|
+
|
|
71
|
+
1. **Intercepts the iterable argument** (the first positional argument by default; set `iterable_arg` to choose another). The function still sees a normal
|
|
72
|
+
iterable, but `safe-state` is silently tracking which items have completed.
|
|
73
|
+
2. **Persists progress after every item** (or every N items — configurable) to
|
|
74
|
+
a `.safestate` file on disk via an atomic write.
|
|
75
|
+
3. **Captures locals on failure.** When an exception escapes the function,
|
|
76
|
+
`safe-state` walks the traceback, grabs the local variables from the failing
|
|
77
|
+
frame, freezes them with `dill` plus the reconnect registry, and writes them
|
|
78
|
+
to the checkpoint. The exception then re-raises as normal — `safe-state`
|
|
79
|
+
never silently swallows errors.
|
|
80
|
+
|
|
81
|
+
On the next invocation with the same job name, the checkpoint is loaded,
|
|
82
|
+
already-completed indices are skipped, and the iteration resumes from where it
|
|
83
|
+
stopped.
|
|
84
|
+
|
|
85
|
+
On successful completion, the checkpoint file is deleted.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Full example: downloading 500 images
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
import requests
|
|
93
|
+
from safe_state import safe_state
|
|
94
|
+
|
|
95
|
+
@safe_state(name="image-scrape", verbose=True)
|
|
96
|
+
def download_all(urls, session):
|
|
97
|
+
for url in urls:
|
|
98
|
+
filename = url.split("/")[-1]
|
|
99
|
+
response = session.get(url, timeout=10)
|
|
100
|
+
response.raise_for_status()
|
|
101
|
+
with open(f"downloads/{filename}", "wb") as f:
|
|
102
|
+
f.write(response.content)
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
urls = open("urls.txt").read().splitlines()
|
|
106
|
+
download_all(urls, requests.Session())
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Run 1** — connection times out on file 234 (zero-based index 233 in the log below):
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
[safe_state] starting fresh job 'image-scrape'
|
|
113
|
+
[safe_state] 'image-scrape' failed at item 233:
|
|
114
|
+
ConnectionError: HTTPSConnectionPool... Read timed out.
|
|
115
|
+
Progress 233/500 saved to .safe_state/image-scrape.safestate
|
|
116
|
+
Traceback (most recent call last): ...
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Run 2** — same command, no flags, no edits:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
[safe_state] resuming 'image-scrape': 233/500 done (run #2)
|
|
123
|
+
[safe_state] skip index 0 (done)
|
|
124
|
+
...
|
|
125
|
+
[safe_state] skip index 232 (done)
|
|
126
|
+
# resumes at item 233, completes through 499
|
|
127
|
+
[✓] Job complete. Checkpoint cleared.
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## More use cases
|
|
133
|
+
|
|
134
|
+
Anything that loops through a batch of work benefits from this:
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
# Bulk database backfill
|
|
138
|
+
@safe_state(name="backfill-2026")
|
|
139
|
+
def backfill(user_ids, conn):
|
|
140
|
+
for uid in user_ids:
|
|
141
|
+
new_value = expensive_computation(uid)
|
|
142
|
+
conn.execute("UPDATE users SET score = ? WHERE id = ?", (new_value, uid))
|
|
143
|
+
conn.commit()
|
|
144
|
+
|
|
145
|
+
# Processing a giant CSV
|
|
146
|
+
@safe_state(name="csv-cleanup")
|
|
147
|
+
def clean_rows(rows, output_writer):
|
|
148
|
+
for row in rows:
|
|
149
|
+
cleaned = normalize(row)
|
|
150
|
+
output_writer.writerow(cleaned)
|
|
151
|
+
|
|
152
|
+
# Calling an API for every record
|
|
153
|
+
@safe_state(name="enrich-leads", save_every=10)
|
|
154
|
+
def enrich(leads, api_client):
|
|
155
|
+
for lead in leads:
|
|
156
|
+
data = api_client.lookup(lead.email)
|
|
157
|
+
lead.enriched_data = data
|
|
158
|
+
lead.save()
|
|
159
|
+
|
|
160
|
+
# Resizing thousands of images
|
|
161
|
+
@safe_state(name="thumbnails")
|
|
162
|
+
def make_thumbs(image_paths):
|
|
163
|
+
for path in image_paths:
|
|
164
|
+
img = Image.open(path)
|
|
165
|
+
img.thumbnail((256, 256))
|
|
166
|
+
img.save(path.replace(".jpg", "_thumb.jpg"))
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
In every case, if the script crashes partway, you just rerun it. No retry
|
|
170
|
+
logic, no progress columns, no resume flags.
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## API
|
|
175
|
+
|
|
176
|
+
### `@safe_state`
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
@safe_state(
|
|
180
|
+
name=None, # job identifier; defaults to fn.__qualname__
|
|
181
|
+
state_dir=".safe_state", # checkpoint directory
|
|
182
|
+
iterable_arg=0, # which arg is the iterable (int index or kwarg name)
|
|
183
|
+
save_every=1, # persist every N completed items
|
|
184
|
+
store_results=False, # also store each item's value (must be serializable)
|
|
185
|
+
keep_on_success=False, # keep checkpoint after successful completion
|
|
186
|
+
verbose=False, # print progress to stderr
|
|
187
|
+
auto_iterate=True, # set False for manual checkpoint() mode
|
|
188
|
+
)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
The decorator works with or without parentheses:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
@safe_state # equivalent to @safe_state()
|
|
195
|
+
def f(items): ...
|
|
196
|
+
|
|
197
|
+
@safe_state(name="custom")
|
|
198
|
+
def g(items): ...
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Inspecting checkpoints
|
|
202
|
+
|
|
203
|
+
Every decorated function exposes three helpers:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
@safe_state
|
|
207
|
+
def my_job(items): ...
|
|
208
|
+
|
|
209
|
+
my_job.peek_checkpoint() # -> Checkpoint object, or None
|
|
210
|
+
my_job.clear_checkpoint() # -> deletes the .safestate file
|
|
211
|
+
my_job.checkpoint_path # -> Path to the .safestate file
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
A `Checkpoint` object holds:
|
|
215
|
+
|
|
216
|
+
- `completed_indices: set[int]`
|
|
217
|
+
- `total_items: int | None`
|
|
218
|
+
- `last_failure: dict | None` — exception type, message, traceback, index
|
|
219
|
+
- `frozen_state: bytes | None` — `dill`-serialized locals from the failing frame
|
|
220
|
+
- `run_count: int`
|
|
221
|
+
- `progress() -> dict` — human-readable summary
|
|
222
|
+
|
|
223
|
+
### Reconnect registry
|
|
224
|
+
|
|
225
|
+
Built-in handlers cover `sqlite3.Connection`, `socket.socket`,
|
|
226
|
+
`requests.Session`, and `io.IOBase` (file handles). To add your own:
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
from safe_state import register_reconnector
|
|
230
|
+
|
|
231
|
+
class MyApiClient:
|
|
232
|
+
def __init__(self, host, token):
|
|
233
|
+
self.host = host
|
|
234
|
+
self.token = token
|
|
235
|
+
self.session = open_some_session(host, token)
|
|
236
|
+
|
|
237
|
+
register_reconnector(
|
|
238
|
+
MyApiClient,
|
|
239
|
+
extract=lambda c: {"host": c.host, "token": c.token},
|
|
240
|
+
reconnect=lambda meta: MyApiClient(meta["host"], meta["token"]),
|
|
241
|
+
)
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
That's it — any `MyApiClient` instance held in your function's locals will now
|
|
245
|
+
survive checkpoint/restore.
|
|
246
|
+
|
|
247
|
+
### Manual checkpointing (advanced)
|
|
248
|
+
|
|
249
|
+
If your function doesn't fit the "loop over items" mould — e.g. it processes a
|
|
250
|
+
graph or a single very long task — set `auto_iterate=False` and call
|
|
251
|
+
`checkpoint()` manually:
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
from safe_state import safe_state, checkpoint
|
|
255
|
+
|
|
256
|
+
@safe_state(auto_iterate=False)
|
|
257
|
+
def big_job(graph):
|
|
258
|
+
visited = set()
|
|
259
|
+
for node in graph.walk():
|
|
260
|
+
process(node)
|
|
261
|
+
visited.add(node.id)
|
|
262
|
+
checkpoint(visited=visited) # freeze progress here
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## What `safe-state` is **not**
|
|
268
|
+
|
|
269
|
+
- **Not a distributed task queue.** For multi-machine job dispatch use Celery,
|
|
270
|
+
Dramatiq, or RQ. `safe-state` solves the much smaller problem of "this one
|
|
271
|
+
process crashed; let me rerun the same script and resume."
|
|
272
|
+
- **Not a transaction manager.** If your work involves multi-step database
|
|
273
|
+
state that needs rollback, use real transactions. `safe-state` checkpoints at
|
|
274
|
+
iteration boundaries; an item is either complete or it isn't.
|
|
275
|
+
- **Not magic.** It doesn't freeze CPython frames mid-instruction. The
|
|
276
|
+
iteration boundary is the resume granularity. If a single item's work is
|
|
277
|
+
itself a long pipeline, decompose it into smaller items.
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## Performance
|
|
282
|
+
|
|
283
|
+
The default `save_every=1` writes a checkpoint after every iteration. For most
|
|
284
|
+
real workloads (network calls, DB writes) this is well under a millisecond of
|
|
285
|
+
overhead and totally invisible. If your inner loop is microsecond-scale, raise
|
|
286
|
+
`save_every` to batch progress flushes:
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
@safe_state(save_every=100)
|
|
290
|
+
def fast_loop(items):
|
|
291
|
+
for item in items:
|
|
292
|
+
cheap_in_memory_work(item)
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT. See [LICENSE](LICENSE).
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## Contributing
|
|
304
|
+
|
|
305
|
+
Issues and pull requests welcome. Run the test suite with:
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
pip install -e ".[dev]"
|
|
309
|
+
pytest
|
|
310
|
+
```
|