pypelite 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pypelite-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 pypelite contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: pypelite
3
+ Version: 0.1.0
4
+ Summary: Tiny pipeline library for ordinary Python scripts.
5
+ Author: pypelite contributors
6
+ License-Expression: MIT
7
+ Keywords: pipeline,cache,workflow,data
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
14
+ Requires-Python: >=3.12
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Dynamic: license-file
18
+
19
+ # pypelite
20
+
21
+ _Pypelite is a tiny pipeline library for ordinary Python scripts._
22
+
23
+ ```python
24
+ import pypelite
25
+
26
+ @pypelite.stage("load", batch="symbols", batch_size=50, workers=4)
27
+ def load_prices(symbols):
28
+ return market_api.fetch_prices(symbols)
29
+
30
+ @pypelite.stage("features")
31
+ def build_features(prices_df):
32
+ return make_model_features(prices_df)
33
+
34
+ @pypelite.stage("train")
35
+ def train_model(features_df):
36
+ return fit_price_model(features_df)
37
+
38
+ with pypelite.pipeline("runs/price-model"):
39
+ prices_df = load_prices(["AAPL", "MSFT", "NVDA"])
40
+ features_df = build_features(prices_df)
41
+ model = train_model(features_df)
42
+ ```
43
+
44
+ Pipelines are resumable: completed stages load from disk, and selected stages
45
+ can be refreshed when they need to run again. No DSL, no DAG boilerplate, no
46
+ Airflow deployment. The Python code is the pipeline.
47
+
48
+ ## Installation
49
+
50
+ ```sh
51
+ pip install pypelite
52
+ ```
53
+
54
+ ## Refresh, Skip, Clean
55
+
56
+ Control a run from the pipeline context.
57
+
58
+ ```python
59
+ with pypelite.pipeline(
60
+ "runs/experiment",
61
+ refresh=["features"],
62
+ skip=["train"],
63
+ clean=["predict"],
64
+ until="features",
65
+ ):
66
+ run_price_model()
67
+ ```
68
+
69
+ - `refresh` recomputes named stages touched by the run.
70
+ - `skip` returns a stage's `skip_value` without touching its cache.
71
+ - `clean` removes old keyed artifacts not touched by a successful run.
72
+ - `until` stops after the named stage completes.
73
+
74
+ ## Advanced Features
75
+
76
+ ### Item Caches
77
+
78
+ Use `key` when each item should have its own saved result.
79
+
80
+ ```python
81
+ @pypelite.stage("predict", key="symbol")
82
+ def predict_price(symbol, feature_row):
83
+ return model.predict(feature_row)
84
+
85
+ with pypelite.pipeline("runs/predictions"):
86
+ for symbol, feature_row in features_df.iterrows():
87
+ predict_price(symbol, feature_row)
88
+ ```
89
+
90
+ ### Fanout
91
+
92
+ Use `workers` when fanout should run in parallel.
93
+
94
+ ```python
95
+ @pypelite.stage("features", key="symbol", batch="symbols", workers=4)
96
+ def build_symbol_features(symbols):
97
+ prices_df = market_api.fetch_prices(symbols)
98
+ return make_model_features_by_symbol(prices_df)
99
+
100
+ with pypelite.pipeline("runs/features"):
101
+ feature_rows = build_symbol_features(["AAPL", "MSFT", "NVDA"])
102
+ ```
103
+
104
+ ### Batches
105
+
106
+ Use `batch` when work should run in chunks.
107
+
108
+ ```python
109
+ @pypelite.stage("predict", key="symbol", batch="rows", batch_size=200)
110
+ def predict_prices(rows):
111
+ return model_api.batch_predict(rows)
112
+
113
+ with pypelite.pipeline("runs/predictions"):
114
+ predictions = predict_prices(feature_rows)
115
+ ```
116
+
117
+ Batch stages can run chunks in parallel with `workers`. Unkeyed batches
118
+ assemble one final artifact; keyed batches save one result per key.
119
+
120
+ ### Shared Archives
121
+
122
+ Stages can choose named archives.
123
+
124
+ ```python
125
+ @pypelite.stage("prices", archive="market", key=("date", "symbol"))
126
+ def prices(date, symbol):
127
+ return market_api.price(date, symbol)
128
+
129
+ with pypelite.pipeline(
130
+ archives={"default": "runs/model", "market": "archive/market"},
131
+ ):
132
+ run_price_model()
133
+ ```
134
+
135
+ Shared archives make it easy for many experiments to reuse the same market
136
+ data while keeping model outputs in their own run directories.
@@ -0,0 +1,118 @@
1
+ # pypelite
2
+
3
+ _Pypelite is a tiny pipeline library for ordinary Python scripts._
4
+
5
+ ```python
6
+ import pypelite
7
+
8
+ @pypelite.stage("load", batch="symbols", batch_size=50, workers=4)
9
+ def load_prices(symbols):
10
+ return market_api.fetch_prices(symbols)
11
+
12
+ @pypelite.stage("features")
13
+ def build_features(prices_df):
14
+ return make_model_features(prices_df)
15
+
16
+ @pypelite.stage("train")
17
+ def train_model(features_df):
18
+ return fit_price_model(features_df)
19
+
20
+ with pypelite.pipeline("runs/price-model"):
21
+ prices_df = load_prices(["AAPL", "MSFT", "NVDA"])
22
+ features_df = build_features(prices_df)
23
+ model = train_model(features_df)
24
+ ```
25
+
26
+ Pipelines are resumable: completed stages load from disk, and selected stages
27
+ can be refreshed when they need to run again. No DSL, no DAG boilerplate, no
28
+ Airflow deployment. The Python code is the pipeline.
29
+
30
+ ## Installation
31
+
32
+ ```sh
33
+ pip install pypelite
34
+ ```
35
+
36
+ ## Refresh, Skip, Clean
37
+
38
+ Control a run from the pipeline context.
39
+
40
+ ```python
41
+ with pypelite.pipeline(
42
+ "runs/experiment",
43
+ refresh=["features"],
44
+ skip=["train"],
45
+ clean=["predict"],
46
+ until="features",
47
+ ):
48
+ run_price_model()
49
+ ```
50
+
51
+ - `refresh` recomputes named stages touched by the run; `"all"` selects every stage.
52
+ - `skip` returns a stage's `skip_value` without touching its cache.
53
+ - `clean` removes old keyed artifacts not touched by a successful run; `"all"` selects every stage.
54
+ - `until` stops after the named stage completes.
55
+
56
+ ## Advanced Features
57
+
58
+ ### Item Caches
59
+
60
+ Use `key` when each item should have its own saved result.
61
+
62
+ ```python
63
+ @pypelite.stage("predict", key="symbol")
64
+ def predict_price(symbol, feature_row):
65
+ return model.predict(feature_row)
66
+
67
+ with pypelite.pipeline("runs/predictions"):
68
+ for symbol, feature_row in features_df.iterrows():
69
+ predict_price(symbol, feature_row)
70
+ ```
71
+
72
+ ### Fanout
73
+
74
+ Use `workers` when fanout should run in parallel.
75
+
76
+ ```python
77
+ @pypelite.stage("features", key="symbol", batch="symbols", workers=4)
78
+ def build_symbol_features(symbols):
79
+ prices_df = market_api.fetch_prices(symbols)
80
+ return make_model_features_by_symbol(prices_df)
81
+
82
+ with pypelite.pipeline("runs/features"):
83
+ feature_rows = build_symbol_features(["AAPL", "MSFT", "NVDA"])
84
+ ```
85
+
86
+ ### Batches
87
+
88
+ Use `batch` when work should run in chunks.
89
+
90
+ ```python
91
+ @pypelite.stage("predict", key="symbol", batch="rows", batch_size=200)
92
+ def predict_prices(rows):
93
+ return model_api.batch_predict(rows)
94
+
95
+ with pypelite.pipeline("runs/predictions"):
96
+ predictions = predict_prices(feature_rows)
97
+ ```
98
+
99
+ Batch stages can run chunks in parallel with `workers`. Unkeyed batches
100
+ assemble one final artifact; keyed batches save one result per key.
101
+
102
+ ### Shared Archives
103
+
104
+ Stages can choose named archives.
105
+
106
+ ```python
107
+ @pypelite.stage("prices", archive="market", key=("date", "symbol"))
108
+ def prices(date, symbol):
109
+ return market_api.price(date, symbol)
110
+
111
+ with pypelite.pipeline(
112
+ archives={"default": "runs/model", "market": "archive/market"},
113
+ ):
114
+ run_price_model()
115
+ ```
116
+
117
+ Shared archives make it easy for many experiments to reuse the same market
118
+ data while keeping model outputs in their own run directories.
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pypelite"
7
+ version = "0.1.0"
8
+ description = "Tiny pipeline library for ordinary Python scripts."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "pypelite contributors" }]
14
+ dependencies = []
15
+ keywords = ["pipeline", "cache", "workflow", "data"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Software Development :: Libraries :: Python Modules",
23
+ ]
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+
28
+ [tool.pytest.ini_options]
29
+ testpaths = ["tests"]
30
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,326 @@
1
+ """Tiny pipeline library for ordinary Python scripts."""
2
+
3
+ import concurrent.futures
4
+ import contextlib
5
+ import contextvars
6
+ import dataclasses
7
+ import enum
8
+ import functools
9
+ import hashlib
10
+ import inspect
11
+ import itertools
12
+ import pathlib
13
+ import pickle
14
+ import shutil
15
+ import tempfile
16
+
17
__version__ = "0.1.0"
# Public names exported by ``from pypelite import *``.
__all__ = ["MultiprocessingType", "pipeline", "stage"]
# Holds the Pipeline of the enclosing ``pipeline()`` block; it has no default,
# so readers call ``.get(None)`` to detect "no active pipeline".
_ACTIVE_PIPELINE = contextvars.ContextVar("pypelite_active_pipeline")
20
+
21
+
22
class _PipelineComplete(Exception):
    """Internal signal raised by a stage to stop the run after ``until``."""
24
+
25
+
26
class MultiprocessingType(enum.Enum):
    """Kinds of worker pool a batch stage can run its chunks on."""

    THREAD = "thread"
    PROCESS = "process"

    def pool(self, workers):
        """Return a new ``concurrent.futures`` executor sized to *workers*."""
        executor_class = (
            concurrent.futures.ThreadPoolExecutor
            if self is MultiprocessingType.THREAD
            else concurrent.futures.ProcessPoolExecutor
        )
        return executor_class(workers)
34
+
35
+
36
@dataclasses.dataclass
class StageState:
    """Per-run bookkeeping a pipeline keeps for one stage."""

    stage: object  # the stage this state belongs to
    path: pathlib.Path  # directory that holds this stage's artifacts
    refresh: bool  # when true, saved artifacts are ignored and recomputed
    touched: set  # names of artifact directories used during this run

    def artifact_dir(self, key):
        """Return ``path / <sha256 hex digest of pickle.dumps(key)>``."""
        digest = hashlib.sha256(pickle.dumps(key))
        return self.path / digest.hexdigest()
45
+
46
+
47
def _names(value):
    """Normalize ``None``, a single name, or an iterable of names to a set."""
    if value is None:
        return set()
    if isinstance(value, str):
        # A bare string is one name, not an iterable of characters.
        return {value}
    return set(value)
53
+
54
+
55
def _dump(path, value):
    """Pickle *value* to *path* via a temporary file in the same directory.

    Missing parent directories are created. The value is first written to a
    temporary file next to *path* and then moved over *path*, so an existing
    artifact is only replaced once the new one has been written completely.

    Raises:
        Whatever ``pickle.dump`` or the filesystem raises. In that case the
        temporary file is removed and an existing *path* is left untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, dir=path.parent) as f:
            temp_path = pathlib.Path(f.name)
            pickle.dump(value, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Rename only after the file is closed and fully written.
        temp_path.replace(path)
    except BaseException:
        # Do not leave half-written temp files behind in the artifact directory.
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
        raise
61
+
62
+
63
class Pipeline:
    """Run-time context for one ``pipeline()`` block.

    Holds the archive directories, the refresh/clean/skip/until selections,
    and the per-stage state created as stages are called.
    """

    def __init__(self, path=None, **config):
        """Resolve archive directories and run options.

        Args:
            path: Directory of the ``"default"`` archive. Used when
                ``archives`` is not given or has no ``"default"`` entry.
            **config: Optional ``archives`` (a mapping of archive name to
                directory, or a single directory used as ``"default"``),
                ``refresh``, ``clean`` and ``skip`` (a stage name, an
                iterable of names, or ``None``) and ``until`` (a stage name).

        Raises:
            TypeError: If neither ``path`` nor ``archives`` names a directory.
        """
        archives = config.get("archives") or {}
        if not isinstance(archives, dict):
            archives = {"default": archives}
        if path is not None:
            # An explicit "default" entry in ``archives`` still wins over path.
            archives = {"default": path, **archives}
        if not archives:
            raise TypeError("pypelite.pipeline requires a path or archives")
        self.archives = {
            name: pathlib.Path(location) for name, location in archives.items()
        }
        self.refresh = _names(config.get("refresh"))
        self.clean = _names(config.get("clean"))
        self.skip = _names(config.get("skip"))
        self.until = config.get("until")
        self.stage_state = {}
        for directory in self.archives.values():
            directory.mkdir(parents=True, exist_ok=True)

    def state_for(self, stage):
        """Return the ``StageState`` for *stage*, creating it on first use.

        State is stored under ``stage.name``. A stage is marked for refresh
        when its name, or ``"all"``, is in the pipeline's ``refresh`` set.

        Raises:
            KeyError: If the stage names an archive this pipeline lacks.
        """
        if state := self.stage_state.get(stage.name):
            return state
        try:
            archive_path = self.archives[stage.archive]
        except KeyError:
            raise KeyError(
                f"stage {stage.name!r} uses unknown archive {stage.archive!r}"
            ) from None
        self.stage_state[stage.name] = StageState(
            stage,
            archive_path / stage.name,
            "all" in self.refresh or stage.name in self.refresh,
            set(),
        )
        return self.stage_state[stage.name]

    def finish(self, completed):
        """Delete untouched artifact directories of stages selected by ``clean``.

        Nothing happens unless *completed* is true. Only stages that touched
        at least one artifact in this run are cleaned, so a stage that was
        never called keeps everything it has stored.
        """
        if not completed:
            return
        clean_all = "all" in self.clean
        for state in self.stage_state.values():
            if not state.touched:
                continue
            if not (clean_all or state.stage.name in self.clean):
                continue
            for artifact in state.path.glob("*"):
                # Artifacts are directories; leave stray files alone rather
                # than letting rmtree fail on them.
                if artifact.name in state.touched or not artifact.is_dir():
                    continue
                shutil.rmtree(artifact)
100
+
101
+
102
class Stage:
    """A pipeline step whose result is pickled and reused on later runs.

    The artifact location is derived from the full set of bound call
    arguments, so each distinct call has its own saved result.
    """

    def __init__(self, generate, **config):
        """Wrap *generate* and read the stage options from *config*.

        Recognized options: ``name`` (defaults to the function name),
        ``archive`` (defaults to ``"default"``), ``key``, ``batch``
        (``True`` selects the first parameter), ``batch_size``,
        ``skip_value``, ``workers`` and ``executor`` (a
        ``MultiprocessingType`` or its value; threads by default).
        """
        functools.update_wrapper(self, generate)
        self.generate = generate
        self.name = config.get("name") or generate.__name__
        self.archive = config.get("archive", "default")
        self.key = config.get("key")
        self.batch_size = config.get("batch_size")
        self.skip_value = config.get("skip_value")
        self.workers = config.get("workers")
        executor = config.get("executor") or MultiprocessingType.THREAD
        if not isinstance(executor, MultiprocessingType):
            executor = MultiprocessingType(executor)
        self.executor = executor
        self.signature = inspect.signature(generate)
        batch = config.get("batch")
        if batch is True:
            batch = next(iter(self.signature.parameters))
        self.batch = batch

    def __call__(self, *args, **kwargs):
        """Run the stage inside the active pipeline and return its value.

        A stage listed in the pipeline's ``skip`` returns ``skip_value``
        without running. After the stage named by ``until`` has produced
        its value, ``_PipelineComplete`` is raised to end the run.

        Raises:
            RuntimeError: If no pipeline is active.
        """
        ctx = _ACTIVE_PIPELINE.get(None)
        if ctx is None:
            raise RuntimeError("pypelite.stage called outside pipeline()")
        value = (
            self.skip_value
            if self.name in ctx.skip
            else self.run(ctx.state_for(self), args, kwargs)
        )
        if ctx.until == self.name:
            raise _PipelineComplete
        return value

    def bound(self, args, kwargs):
        """Return the call's arguments by parameter name, defaults applied."""
        call = self.signature.bind(*args, **kwargs)
        call.apply_defaults()
        return call.arguments

    def run(self, state, args, kwargs):
        """Return the saved artifact, or compute and save it.

        The artifact is recomputed when it does not exist yet or when the
        stage is marked for refresh.
        """
        target = state.artifact_dir(self.bound(args, kwargs)) / "artifact.pkl"
        if state.refresh or not target.exists():
            value = self.generate(*args, **kwargs)
            _dump(target, value)
            return value
        # NOTE(review): unpickling assumes the archive directory is trusted.
        return pickle.loads(target.read_bytes())
146
+
147
+
148
class KeyedStage(Stage):
    """A stage whose saved result is identified by a key, not by all arguments."""

    def key_value(self, args, kwargs):
        """Compute the cache key for one call.

        ``key`` may be a callable (called with the bound arguments as
        keywords), a parameter name, or an iterable of parameter names
        (producing a tuple).
        """
        arguments = self.bound(args, kwargs)
        selector = self.key
        if callable(selector):
            return selector(**arguments)
        if isinstance(selector, str):
            return arguments[selector]
        return tuple(arguments[name] for name in selector)

    def run(self, state, args, kwargs):
        """Return the result saved for this key, or compute and save it.

        The artifact directory is recorded in ``state.touched`` whether the
        value was loaded or recomputed.
        """
        key = self.key_value(args, kwargs)
        artifact_dir = state.artifact_dir(key)
        target = artifact_dir / "artifact.pkl"
        if state.refresh or not target.exists():
            value = self.generate(*args, **kwargs)
            # The key is stored next to the artifact it identifies.
            _dump(artifact_dir / "key.pkl", key)
            _dump(target, value)
        else:
            value = pickle.loads(target.read_bytes())
        state.touched.add(artifact_dir.name)
        return value
169
+
170
+
171
class BatchStage(Stage):
    """Shared chunk runner for stages that process their input in batches."""

    def run_batches(self, chunks, args_d):
        """Yield one result per chunk, in chunk order.

        Each call receives the bound arguments *args_d* as keywords, with
        the batch parameter replaced by the chunk as a list. Without
        ``workers`` the chunks run one at a time as the generator is
        consumed; with ``workers`` they are all submitted to a pool up front.
        """

        def call_kwargs(chunk):
            return {**args_d, self.batch: list(chunk)}

        if not self.workers:
            for chunk in chunks:
                yield self.generate(**call_kwargs(chunk))
            return
        # NOTE(review): the process executor has to pickle ``self.generate``;
        # confirm that works for functions replaced by their decorator.
        with self.executor.pool(self.workers) as worker_pool:
            pending = [
                worker_pool.submit(self.generate, **call_kwargs(chunk))
                for chunk in chunks
            ]
            for future in pending:
                yield future.result()
186
+
187
+
188
class BatchCollectionStage(BatchStage):
    """Unkeyed batch stage: chunk results are joined into one final artifact."""

    def run(self, state, args, kwargs):
        """Return the saved final artifact, or build it chunk by chunk.

        Each chunk result is saved under ``chunks/<index>`` as soon as it is
        available, so chunks already saved are loaded rather than recomputed
        (unless the stage is marked for refresh). The final value is the
        chunk results chained into one list.
        """
        args_d = self.bound(args, kwargs)
        artifact_dir = state.artifact_dir(args_d)
        final = artifact_dir / "artifact.pkl"
        if final.exists() and not state.refresh:
            return pickle.loads(final.read_bytes())
        rows = list(args_d[self.batch])
        # No batch_size means one chunk; ``or 1`` keeps batched() valid for
        # empty input.
        size = self.batch_size or len(rows) or 1
        chunks = list(itertools.batched(rows, size))
        values = [None] * len(chunks)
        pending = []  # (chunk index, chunk artifact file) still to compute
        # NOTE(review): chunk files are addressed by index only; confirm that
        # reusing them after ``batch_size`` changes is acceptable.
        for index in range(len(chunks)):
            chunk_file = artifact_dir / "chunks" / str(index) / "artifact.pkl"
            if state.refresh or not chunk_file.exists():
                pending.append((index, chunk_file))
            else:
                values[index] = pickle.loads(chunk_file.read_bytes())
        todo = (chunks[index] for index, _chunk_file in pending)
        for (index, chunk_file), batch_value in zip(
            pending, self.run_batches(todo, args_d)
        ):
            values[index] = batch_value
            _dump(chunk_file, batch_value)
        value = list(itertools.chain.from_iterable(values))
        _dump(final, value)
        return value
218
+
219
+
220
class BatchKeyedStage(BatchStage, KeyedStage):
    """Keyed batch stage: one saved result per input record."""

    def item_key(self, record):
        """Return the cache key of one record.

        A callable ``key`` is called with the record. Otherwise ``key``
        names one field (or several, giving a tuple), read by item lookup
        for dict records and by attribute lookup for anything else.
        """
        selector = self.key
        if callable(selector):
            return selector(record)

        if isinstance(record, dict):

            def read(name):
                return record[name]

        else:

            def read(name):
                return getattr(record, name)

        if isinstance(selector, str):
            return read(selector)
        return tuple(read(name) for name in selector)

    def run(self, state, args, kwargs):
        """Return one result per input record, in input order.

        Records with a saved result are loaded; the rest are computed in
        batches and saved per key. Every artifact used is recorded in
        ``state.touched``.

        Raises:
            ValueError: If a batch returns a different number of results
                than it was given records.
        """
        args_d = self.bound(args, kwargs)
        rows = list(args_d[self.batch])
        keys = [self.item_key(row) for row in rows]
        artifact_dirs = [state.artifact_dir(key) for key in keys]
        results = [None] * len(rows)
        missing_indexes = []
        for index, artifact_dir in enumerate(artifact_dirs):
            cached = artifact_dir / "artifact.pkl"
            if state.refresh or not cached.exists():
                missing_indexes.append(index)
                continue
            results[index] = pickle.loads(cached.read_bytes())
            state.touched.add(artifact_dir.name)
        size = self.batch_size or len(missing_indexes) or 1
        # Batch indexes and rows identically so they line up chunk by chunk.
        index_groups = itertools.batched(missing_indexes, size)
        row_groups = itertools.batched(
            [rows[index] for index in missing_indexes], size
        )
        for indexes, values in zip(
            index_groups, self.run_batches(row_groups, args_d)
        ):
            if len(indexes) != len(values):
                raise ValueError("batch result length must match input length")
            for index, value in zip(indexes, values):
                artifact_dir = artifact_dirs[index]
                _dump(artifact_dir / "key.pkl", keys[index])
                _dump(artifact_dir / "artifact.pkl", value)
                state.touched.add(artifact_dir.name)
                results[index] = value
        return results
263
+
264
+
265
def stage(
    name=None,
    *,
    archive="default",
    key=None,
    batch=None,
    batch_size=None,
    skip_value=None,
    workers=None,
    executor=None,
):
    """Turn a function into a cached pipeline stage.

    Works as ``@stage``, as ``@stage("name", ...)`` and as a direct call
    ``stage(function, ...)``.

    Args:
        name: Stage name, or the function itself when used without
            parentheses or called directly. Defaults to the function name.
        archive: Name of the pipeline archive that stores the results.
        key: Parameter name, iterable of names, or callable that identifies
            a saved result; ``None`` identifies it by all arguments.
        batch: Name of the parameter to split into chunks (``True`` selects
            the first parameter); ``None`` disables batching.
        batch_size: Number of items per chunk.
        skip_value: Value returned when the pipeline skips this stage.
        workers: Number of pool workers for running chunks in parallel.
        executor: ``MultiprocessingType`` (or its value) used with ``workers``.

    Returns:
        The stage object, or a decorator producing one when *name* is not
        a callable.
    """
    config = {
        "archive": archive,
        "key": key,
        "batch": batch,
        "batch_size": batch_size,
        "skip_value": skip_value,
        "workers": workers,
        "executor": executor,
    }
    if batch is not None:
        stage_class = BatchCollectionStage if key is None else BatchKeyedStage
    else:
        stage_class = Stage if key is None else KeyedStage
    if callable(name):
        # Forward the options so ``stage(function, key=...)`` keeps them; for
        # the bare ``@stage`` form they are all defaults.
        return stage_class(name, **config)

    def decorate(generate):
        return stage_class(generate, name=name, **config)

    return decorate
295
+
296
+
297
@contextlib.contextmanager
def pipeline(
    path=None,
    *,
    archives=None,
    refresh=None,
    clean=None,
    skip=None,
    until=None,
):
    """Open a pipeline context in which stages may be called.

    Args:
        path: Directory of the ``"default"`` archive.
        archives: Mapping of archive name to directory, or one directory.
        refresh: Stage name(s) to recompute instead of loading.
        clean: Stage name(s) whose untouched artifacts are removed after a
            completed run.
        skip: Stage name(s) that return their ``skip_value`` without running.
        until: Name of the stage after which the run stops.

    Yields:
        The ``Pipeline`` object for this run.

    Raises:
        RuntimeError: If another pipeline context is already active.
    """
    if _ACTIVE_PIPELINE.get(None) is not None:
        raise RuntimeError("pypelite.pipeline contexts cannot be nested")
    ctx = Pipeline(
        path,
        archives=archives,
        refresh=refresh,
        clean=clean,
        skip=skip,
        until=until,
    )
    token = _ACTIVE_PIPELINE.set(ctx)
    completed = False
    try:
        try:
            yield ctx
            completed = True
        except _PipelineComplete:
            # Stopping at ``until`` counts as a completed run.
            completed = True
        finally:
            ctx.finish(completed)
    finally:
        # Always release the context, even if cleanup in finish() fails;
        # otherwise every later pipeline() would be rejected as nested.
        _ACTIVE_PIPELINE.reset(token)
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: pypelite
3
+ Version: 0.1.0
4
+ Summary: Tiny pipeline library for ordinary Python scripts.
5
+ Author: pypelite contributors
6
+ License-Expression: MIT
7
+ Keywords: pipeline,cache,workflow,data
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
14
+ Requires-Python: >=3.12
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Dynamic: license-file
18
+
19
+ # pypelite
20
+
21
+ _Pypelite is a tiny pipeline library for ordinary Python scripts._
22
+
23
+ ```python
24
+ import pypelite
25
+
26
+ @pypelite.stage("load", batch="symbols", batch_size=50, workers=4)
27
+ def load_prices(symbols):
28
+ return market_api.fetch_prices(symbols)
29
+
30
+ @pypelite.stage("features")
31
+ def build_features(prices_df):
32
+ return make_model_features(prices_df)
33
+
34
+ @pypelite.stage("train")
35
+ def train_model(features_df):
36
+ return fit_price_model(features_df)
37
+
38
+ with pypelite.pipeline("runs/price-model"):
39
+ prices_df = load_prices(["AAPL", "MSFT", "NVDA"])
40
+ features_df = build_features(prices_df)
41
+ model = train_model(features_df)
42
+ ```
43
+
44
+ Pipelines are resumable: completed stages load from disk, and selected stages
45
+ can be refreshed when they need to run again. No DSL, no DAG boilerplate, no
46
+ Airflow deployment. The Python code is the pipeline.
47
+
48
+ ## Installation
49
+
50
+ ```sh
51
+ pip install pypelite
52
+ ```
53
+
54
+ ## Refresh, Skip, Clean
55
+
56
+ Control a run from the pipeline context.
57
+
58
+ ```python
59
+ with pypelite.pipeline(
60
+ "runs/experiment",
61
+ refresh=["features"],
62
+ skip=["train"],
63
+ clean=["predict"],
64
+ until="features",
65
+ ):
66
+ run_price_model()
67
+ ```
68
+
69
+ - `refresh` recomputes named stages touched by the run.
70
+ - `skip` returns a stage's `skip_value` without touching its cache.
71
+ - `clean` removes old keyed artifacts not touched by a successful run.
72
+ - `until` stops after the named stage completes.
73
+
74
+ ## Advanced Features
75
+
76
+ ### Item Caches
77
+
78
+ Use `key` when each item should have its own saved result.
79
+
80
+ ```python
81
+ @pypelite.stage("predict", key="symbol")
82
+ def predict_price(symbol, feature_row):
83
+ return model.predict(feature_row)
84
+
85
+ with pypelite.pipeline("runs/predictions"):
86
+ for symbol, feature_row in features_df.iterrows():
87
+ predict_price(symbol, feature_row)
88
+ ```
89
+
90
+ ### Fanout
91
+
92
+ Use `workers` when fanout should run in parallel.
93
+
94
+ ```python
95
+ @pypelite.stage("features", key="symbol", batch="symbols", workers=4)
96
+ def build_symbol_features(symbols):
97
+ prices_df = market_api.fetch_prices(symbols)
98
+ return make_model_features_by_symbol(prices_df)
99
+
100
+ with pypelite.pipeline("runs/features"):
101
+ feature_rows = build_symbol_features(["AAPL", "MSFT", "NVDA"])
102
+ ```
103
+
104
+ ### Batches
105
+
106
+ Use `batch` when work should run in chunks.
107
+
108
+ ```python
109
+ @pypelite.stage("predict", key="symbol", batch="rows", batch_size=200)
110
+ def predict_prices(rows):
111
+ return model_api.batch_predict(rows)
112
+
113
+ with pypelite.pipeline("runs/predictions"):
114
+ predictions = predict_prices(feature_rows)
115
+ ```
116
+
117
+ Batch stages can run chunks in parallel with `workers`. Unkeyed batches
118
+ assemble one final artifact; keyed batches save one result per key.
119
+
120
+ ### Shared Archives
121
+
122
+ Stages can choose named archives.
123
+
124
+ ```python
125
+ @pypelite.stage("prices", archive="market", key=("date", "symbol"))
126
+ def prices(date, symbol):
127
+ return market_api.price(date, symbol)
128
+
129
+ with pypelite.pipeline(
130
+ archives={"default": "runs/model", "market": "archive/market"},
131
+ ):
132
+ run_price_model()
133
+ ```
134
+
135
+ Shared archives make it easy for many experiments to reuse the same market
136
+ data while keeping model outputs in their own run directories.
@@ -0,0 +1,9 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/pypelite/__init__.py
5
+ src/pypelite.egg-info/PKG-INFO
6
+ src/pypelite.egg-info/SOURCES.txt
7
+ src/pypelite.egg-info/dependency_links.txt
8
+ src/pypelite.egg-info/top_level.txt
9
+ tests/test_pipeline.py
@@ -0,0 +1 @@
1
+ pypelite
@@ -0,0 +1,152 @@
1
+ import pypelite
2
+
3
+
4
def test_basic_pipeline_reuses_saved_stage_output(tmp_path):
    """A second run loads the saved output even though the code changed."""
    revision = {"value": 1}

    @pypelite.stage("load")
    def load_records(path):
        return f"{path}:v{revision['value']}"

    @pypelite.stage("build")
    def build_table(records):
        return records.upper()

    def run_once():
        with pypelite.pipeline(tmp_path):
            return build_table(load_records("records.json"))

    assert run_once() == "RECORDS.JSON:V1"

    revision["value"] = 2

    assert run_once() == "RECORDS.JSON:V1"
24
+
25
+
26
def test_refresh_recomputes_named_stage(tmp_path):
    """Listing a stage in ``refresh`` makes it run again."""
    revision = {"value": 1}

    @pypelite.stage("build")
    def build_table(source):
        return f"{source}:v{revision['value']}"

    with pypelite.pipeline(tmp_path):
        first = build_table("records")
    assert first == "records:v1"

    revision["value"] = 2

    with pypelite.pipeline(tmp_path, refresh=["build"]):
        second = build_table("records")
    assert second == "records:v2"
40
+
41
+
42
def test_skip_returns_stage_skip_value(tmp_path):
    """A skipped stage returns its ``skip_value`` instead of running."""

    @pypelite.stage("history", skip_value=None)
    def history(case_id):
        return f"history {case_id}"

    with pypelite.pipeline(tmp_path, skip=["history"]):
        outcome = history("a")
    assert outcome is None
49
+
50
+
51
def test_clean_removes_old_keyed_results(tmp_path):
    """``clean`` drops keyed results that the cleaning run did not use."""
    factor = {"value": 2}

    @pypelite.stage("score", key="case_id")
    def score(case_id, value):
        return value * factor["value"]

    # Save results for both keys.
    with pypelite.pipeline(tmp_path):
        assert score("a", 3) == 6
        assert score("b", 4) == 8

    # Only "b" is used here, so the saved result for "a" is removed.
    with pypelite.pipeline(tmp_path, clean=["score"]):
        assert score("b", 99) == 8

    factor["value"] = 10

    # "a" is recomputed with the new factor; "b" still loads from disk.
    with pypelite.pipeline(tmp_path):
        assert score("a", 3) == 30
        assert score("b", 4) == 8
70
+
71
+
72
def test_keyed_stage_reuses_saved_result_by_key(tmp_path):
    """Only the key argument decides whether a saved result is reused."""
    factor = {"value": 2}

    @pypelite.stage("score", key="case_id")
    def score(case_id, value):
        return value * factor["value"]

    with pypelite.pipeline(tmp_path):
        assert score("a", 3) == 6

    factor["value"] = 10

    with pypelite.pipeline(tmp_path):
        # Same key, different value argument: the saved result is returned.
        assert score("a", 99) == 6
        # New key: computed with the current factor.
        assert score("b", 4) == 40
87
+
88
+
89
def test_batch_stage_splits_work_into_chunks(tmp_path):
    """An unkeyed batch stage runs per chunk and joins the chunk results."""
    seen_chunks = []

    @pypelite.stage("judge", batch="records", batch_size=2)
    def judge(records):
        case_ids = [record["case_id"] for record in records]
        seen_chunks.append(case_ids)
        return [case_id.upper() for case_id in case_ids]

    records = [{"case_id": case_id} for case_id in ("a", "b", "c")]

    with pypelite.pipeline(tmp_path):
        assert judge(records) == ["A", "B", "C"]

    assert seen_chunks == [["a", "b"], ["c"]]
107
+
108
+
109
def test_batch_keyed_stage_reuses_saved_items(tmp_path):
    """A keyed batch stage recomputes only records without a saved result."""
    revision = {"value": 1}

    @pypelite.stage("judge", key="case_id", batch="records", batch_size=2)
    def judge(records):
        suffix = f":v{revision['value']}"
        return [record["case_id"].upper() + suffix for record in records]

    def records_for(*case_ids):
        return [{"case_id": case_id} for case_id in case_ids]

    with pypelite.pipeline(tmp_path):
        assert judge(records_for("a", "b")) == ["A:v1", "B:v1"]

    revision["value"] = 2

    with pypelite.pipeline(tmp_path):
        assert judge(records_for("b", "c")) == ["B:v1", "C:v2"]
132
+
133
+
134
def test_stage_can_use_shared_archive(tmp_path):
    """Runs with different default archives reuse one named archive."""
    revision = {"value": 1}
    shared_archive = tmp_path / "shared"

    @pypelite.stage("prices", archive="shared")
    def prices(symbol):
        return f"{symbol}:v{revision['value']}"

    def archives_for(run_name):
        return {"default": tmp_path / run_name, "shared": shared_archive}

    with pypelite.pipeline(archives=archives_for("run-a")):
        assert prices("ABC") == "ABC:v1"

    revision["value"] = 2

    with pypelite.pipeline(archives=archives_for("run-b")):
        assert prices("ABC") == "ABC:v1"