daglite 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daglite-0.3.0/PKG-INFO +406 -0
- daglite-0.3.0/README.md +395 -0
- daglite-0.3.0/pyproject.toml +183 -0
- daglite-0.3.0/setup.cfg +4 -0
- daglite-0.3.0/src/daglite/__init__.py +70 -0
- daglite-0.3.0/src/daglite/backends/__init__.py +36 -0
- daglite-0.3.0/src/daglite/backends/base.py +33 -0
- daglite-0.3.0/src/daglite/backends/distributed/__init__.py +0 -0
- daglite-0.3.0/src/daglite/backends/local.py +178 -0
- daglite-0.3.0/src/daglite/engine.py +624 -0
- daglite-0.3.0/src/daglite/exceptions.py +65 -0
- daglite-0.3.0/src/daglite/futures.py +467 -0
- daglite-0.3.0/src/daglite/graph/__init__.py +0 -0
- daglite-0.3.0/src/daglite/graph/base.py +206 -0
- daglite-0.3.0/src/daglite/graph/builder.py +53 -0
- daglite-0.3.0/src/daglite/graph/nodes.py +131 -0
- daglite-0.3.0/src/daglite/hooks/__init__.py +13 -0
- daglite-0.3.0/src/daglite/hooks/manager.py +106 -0
- daglite-0.3.0/src/daglite/hooks/markers.py +12 -0
- daglite-0.3.0/src/daglite/hooks/specs.py +106 -0
- daglite-0.3.0/src/daglite/pipelines.py +191 -0
- daglite-0.3.0/src/daglite/py.typed +0 -0
- daglite-0.3.0/src/daglite/settings.py +64 -0
- daglite-0.3.0/src/daglite/tasks.py +432 -0
- daglite-0.3.0/src/daglite.egg-info/PKG-INFO +406 -0
- daglite-0.3.0/src/daglite.egg-info/SOURCES.txt +31 -0
- daglite-0.3.0/src/daglite.egg-info/dependency_links.txt +1 -0
- daglite-0.3.0/src/daglite.egg-info/requires.txt +5 -0
- daglite-0.3.0/src/daglite.egg-info/top_level.txt +1 -0
- daglite-0.3.0/tests/test_graph.py +579 -0
- daglite-0.3.0/tests/test_pipeline.py +246 -0
- daglite-0.3.0/tests/test_settings.py +82 -0
- daglite-0.3.0/tests/test_tasks_futures.py +584 -0
daglite-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: daglite
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Lightweight Python framework for building static DAGs with explicit bindings.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: pluggy>=1.6.0
|
|
8
|
+
Requires-Dist: typing-extensions>=4.15.0
|
|
9
|
+
Provides-Extra: cli
|
|
10
|
+
Requires-Dist: daglite-cli; extra == "cli"
|
|
11
|
+
|
|
12
|
+
# Daglite
|
|
13
|
+
|
|
14
|
+
[PyPI](https://pypi.org/project/daglite/)
|
|
15
|
+
[Python](https://www.python.org/)
|
|
16
|
+
[Checked with mypy](http://mypy-lang.org/)
|
|
17
|
+
[Checked with pyright](https://github.com/microsoft/pyright)
|
|
18
|
+
[Linted with Ruff](https://github.com/astral-sh/ruff)
|
|
19
|
+
[CI](https://github.com/cswartzvi/daglite/actions/workflows/testing.yaml)
|
|
20
|
+
[Coverage](https://codecov.io/github/cswartzvi/daglite)
|
|
21
|
+
|
|
22
|
+
A lightweight, type-safe Python framework for building and executing DAGs (Directed Acyclic Graphs) with explicit data flow and composable operations.
|
|
23
|
+
|
|
24
|
+
**[📚 Documentation](https://cswartzvi.github.io/daglite/)** | **[🚀 Getting Started](#quick-start)** | **[💡 Examples](#examples)**
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## ✨ Key Features
|
|
29
|
+
|
|
30
|
+
- **🎯 Explicit & Type-Safe**: Complete type checking support with `mypy`, `pyright`, and `pyrefly`
|
|
31
|
+
- **🔗 Fluent API**: Chain operations naturally with `.then()`, `.map()`, `.join()`
|
|
32
|
+
- **📦 Minimal Dependencies**: Core library depends only on `pluggy` and `typing-extensions`
|
|
33
|
+
- **⚡ Async Support**: Built-in async execution with threading/multiprocessing
|
|
34
|
+
- **🧩 Composable**: Mix and match patterns - sequential, fan-out, map-reduce
|
|
35
|
+
- **🔍 Testable**: Pure functions make DAGs easy to test and debug
|
|
36
|
+
- **📋 CLI Support**: Define pipelines and run them from the command line
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## 🎬 Quick Start
|
|
41
|
+
|
|
42
|
+
### Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install daglite
|
|
46
|
+
|
|
47
|
+
# With CLI support
|
|
48
|
+
pip install daglite[cli]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Basic Example
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from daglite import task, evaluate
|
|
55
|
+
|
|
56
|
+
@task
|
|
57
|
+
def fetch_data(url: str) -> dict:
|
|
58
|
+
"""Fetch data from an API."""
|
|
59
|
+
return {"url": url, "data": [...]}
|
|
60
|
+
|
|
61
|
+
@task
|
|
62
|
+
def process(data: dict) -> list:
|
|
63
|
+
"""Process the fetched data."""
|
|
64
|
+
return [item.upper() for item in data["data"]]
|
|
65
|
+
|
|
66
|
+
@task
|
|
67
|
+
def save(items: list, path: str) -> None:
|
|
68
|
+
"""Save results to a file."""
|
|
69
|
+
with open(path, "w") as f:
|
|
70
|
+
f.write("\n".join(items))
|
|
71
|
+
|
|
72
|
+
# Build the DAG
|
|
73
|
+
fetched = fetch_data.bind(url="https://api.example.com")
|
|
74
|
+
processed = process.bind(data=fetched)
|
|
75
|
+
saved = save.bind(items=processed, path="output.txt")
|
|
76
|
+
|
|
77
|
+
# Execute the DAG
|
|
78
|
+
evaluate(saved)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## 🌟 The Fluent API
|
|
84
|
+
|
|
85
|
+
Daglite provides two ways to compose tasks: **explicit** (shown above) and **fluent**:
|
|
86
|
+
|
|
87
|
+
### Fluent Chaining
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from daglite import task, evaluate
|
|
91
|
+
|
|
92
|
+
@task
|
|
93
|
+
def fetch(url: str) -> str:
|
|
94
|
+
return requests.get(url).text
|
|
95
|
+
|
|
96
|
+
@task
|
|
97
|
+
def parse(html: str, selector: str) -> dict:
|
|
98
|
+
return extract(html, selector)
|
|
99
|
+
|
|
100
|
+
@task
|
|
101
|
+
def transform(data: dict, format: str) -> str:
|
|
102
|
+
return convert(data, format)
|
|
103
|
+
|
|
104
|
+
# Fluent style - chain operations naturally
|
|
105
|
+
result = evaluate(
|
|
106
|
+
fetch.bind(url="https://example.com")
|
|
107
|
+
.then(parse, selector=".content")
|
|
108
|
+
.then(transform, format="json")
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Map-Reduce Patterns
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
@task
|
|
116
|
+
def square(x: int) -> int:
|
|
117
|
+
return x ** 2
|
|
118
|
+
|
|
119
|
+
@task
|
|
120
|
+
def sum_all(values: list[int]) -> int:
|
|
121
|
+
return sum(values)
|
|
122
|
+
|
|
123
|
+
# Fan-out with .product() (Cartesian product)
|
|
124
|
+
result = evaluate(
|
|
125
|
+
square.product(x=[1, 2, 3, 4])
|
|
126
|
+
.map(lambda x: x * 2) # Transform each
|
|
127
|
+
.join(sum_all) # Reduce to single value
|
|
128
|
+
)
|
|
129
|
+
# Result: 60 = (2 + 8 + 18 + 32)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Pairwise Operations
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
@task
|
|
136
|
+
def multiply(x: int, y: int) -> int:
|
|
137
|
+
return x * y
|
|
138
|
+
|
|
139
|
+
# Zip sequences element-wise
|
|
140
|
+
numbers = multiply.zip(x=[1, 2, 3], y=[10, 20, 30])
|
|
141
|
+
result = evaluate(numbers)
|
|
142
|
+
# Result: [10, 40, 90]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## 💡 Examples
|
|
148
|
+
|
|
149
|
+
### Sequential Pipeline
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
@task
|
|
153
|
+
def load_config(path: str) -> dict:
|
|
154
|
+
return json.load(open(path))
|
|
155
|
+
|
|
156
|
+
@task
|
|
157
|
+
def init_model(config: dict) -> Model:
|
|
158
|
+
return Model(**config)
|
|
159
|
+
|
|
160
|
+
@task
|
|
161
|
+
def train(model: Model, data: pd.DataFrame) -> Model:
|
|
162
|
+
model.fit(data)
|
|
163
|
+
return model
|
|
164
|
+
|
|
165
|
+
# Build pipeline
|
|
166
|
+
model = (
|
|
167
|
+
load_config.bind(path="config.json")
|
|
168
|
+
.then(init_model)
|
|
169
|
+
.then(train, data=training_data)
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
result = evaluate(model)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Parallel Fan-Out
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
@task
|
|
179
|
+
def fetch_user(user_id: int) -> dict:
|
|
180
|
+
return api.get(f"/users/{user_id}")
|
|
181
|
+
|
|
182
|
+
@task
|
|
183
|
+
def enrich(user: dict, with_avatar: bool) -> dict:
|
|
184
|
+
if with_avatar:
|
|
185
|
+
user["avatar"] = fetch_avatar(user["id"])
|
|
186
|
+
return user
|
|
187
|
+
|
|
188
|
+
@task
|
|
189
|
+
def save_all(users: list[dict]) -> None:
|
|
190
|
+
db.bulk_insert(users)
|
|
191
|
+
|
|
192
|
+
# Process multiple users in parallel
|
|
193
|
+
result = evaluate(
|
|
194
|
+
fetch_user.product(user_id=[1, 2, 3, 4, 5])
|
|
195
|
+
.map(enrich, with_avatar=True)
|
|
196
|
+
.join(save_all)
|
|
197
|
+
)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Complex DAG
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
@task
|
|
204
|
+
def fetch_prices(symbols: list[str]) -> pd.DataFrame:
|
|
205
|
+
return yfinance.download(symbols)
|
|
206
|
+
|
|
207
|
+
@task
|
|
208
|
+
def calculate_returns(prices: pd.DataFrame, window: int) -> pd.DataFrame:
|
|
209
|
+
return prices.pct_change(window)
|
|
210
|
+
|
|
211
|
+
@task
|
|
212
|
+
def compute_correlation(returns: pd.DataFrame) -> pd.DataFrame:
|
|
213
|
+
return returns.corr()
|
|
214
|
+
|
|
215
|
+
@task
|
|
216
|
+
def find_pairs(corr: pd.DataFrame, threshold: float) -> list[tuple]:
|
|
217
|
+
return [(i, j) for i, j in high_correlation_pairs(corr, threshold)]
|
|
218
|
+
|
|
219
|
+
# Build analytics pipeline
|
|
220
|
+
result = evaluate(
|
|
221
|
+
fetch_prices.bind(symbols=["AAPL", "GOOGL", "MSFT"])
|
|
222
|
+
.then(calculate_returns, window=20)
|
|
223
|
+
.then(compute_correlation)
|
|
224
|
+
.then(find_pairs, threshold=0.8)
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## 🎯 Why Daglite?
|
|
231
|
+
|
|
232
|
+
Modern data workflows need structure, but most DAG frameworks are overkill. Airflow requires Docker and Kubernetes. Prefect needs a server. Dagster has dozens of dependencies. What if you just want to build a clean, type-safe pipeline that runs locally?
|
|
233
|
+
|
|
234
|
+
**Daglite fills this gap.**
|
|
235
|
+
|
|
236
|
+
### The Problem
|
|
237
|
+
|
|
238
|
+
Building data pipelines typically involves:
|
|
239
|
+
- **Manual dependency tracking** - Functions scattered across files, implicit data flow
|
|
240
|
+
- **No type safety** - Runtime errors from type mismatches, no IDE help
|
|
241
|
+
- **Heavy infrastructure** - Can't run without databases, containers, or cloud services
|
|
242
|
+
- **Complex APIs** - Steep learning curve just to chain a few functions
|
|
243
|
+
|
|
244
|
+
### The Solution
|
|
245
|
+
|
|
246
|
+
Daglite provides:
|
|
247
|
+
- **Explicit dependencies** - Clear data flow graph visible in your code
|
|
248
|
+
- **Complete type checking** - Catch errors before runtime, autocomplete everywhere
|
|
249
|
+
- **Zero infrastructure** - Pure Python, runs anywhere Python runs
|
|
250
|
+
- **Simple API** - If you know Python functions, you know Daglite
|
|
251
|
+
|
|
252
|
+
### When to Use Daglite
|
|
253
|
+
|
|
254
|
+
✅ **Perfect for:**
|
|
255
|
+
- ETL scripts and data transformations
|
|
256
|
+
- ML pipelines (feature engineering, training, evaluation)
|
|
257
|
+
- CLI tools that need workflow orchestration
|
|
258
|
+
- Local development and testing
|
|
259
|
+
- Air-gapped or restricted environments
|
|
260
|
+
- Projects where you want type safety and simplicity
|
|
261
|
+
|
|
262
|
+
❌ **Not ideal for:**
|
|
263
|
+
- Production job scheduling (use Airflow, Prefect)
|
|
264
|
+
- Real-time streaming (use Kafka, Flink)
|
|
265
|
+
- Distributed computing at scale (use Spark, Dask)
|
|
266
|
+
- Multi-tenant workflow orchestration (use Dagster)
|
|
267
|
+
|
|
268
|
+
Daglite is the **lightweight alternative** - maximum clarity with minimal complexity.
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## 🏗️ Core Concepts
|
|
273
|
+
|
|
274
|
+
### Tasks
|
|
275
|
+
|
|
276
|
+
Functions decorated with `@task` become composable DAG nodes:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
@task
|
|
280
|
+
def process_data(input: str, param: int = 10) -> dict:
|
|
281
|
+
"""Tasks are just functions with explicit inputs/outputs."""
|
|
282
|
+
return {"result": input * param}
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Binding & Futures
|
|
286
|
+
|
|
287
|
+
Tasks don't execute immediately - they return futures:
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
# Create a future (lazy evaluation)
|
|
291
|
+
future = process_data.bind(input="hello", param=5)
|
|
292
|
+
|
|
293
|
+
# Execute when ready
|
|
294
|
+
result = evaluate(future) # Returns {"result": "hellohellohellohellohello"}
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### Composition Patterns
|
|
298
|
+
|
|
299
|
+
| Pattern | Method | Use Case |
|
|
300
|
+
|---------|--------|----------|
|
|
301
|
+
| Sequential | `.bind()` + `.then()` | Chain dependent operations |
|
|
302
|
+
| Cartesian | `.product()` | Parameter sweeps, all combinations |
|
|
303
|
+
| Pairwise | `.zip()` | Element-wise operations |
|
|
304
|
+
| Transform | `.map()` | Apply function to each element |
|
|
305
|
+
| Reduce | `.join()` | Aggregate sequence to single value |
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## 🔧 Advanced Features
|
|
310
|
+
|
|
311
|
+
### Async Execution
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
# Run DAG with threading backend
|
|
315
|
+
result = evaluate(my_dag, use_async=True)
|
|
316
|
+
|
|
317
|
+
# Custom backends
|
|
318
|
+
@task(backend="threading")
|
|
319
|
+
def io_bound_task(url: str) -> str:
|
|
320
|
+
return requests.get(url).text
|
|
321
|
+
|
|
322
|
+
@task(backend="multiprocessing")
|
|
323
|
+
def cpu_bound_task(data: np.ndarray) -> np.ndarray:
|
|
324
|
+
return expensive_computation(data)
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### Fixed Parameters
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
# Partially apply parameters
|
|
331
|
+
normalize = scale.fix(factor=100, offset=10)
|
|
332
|
+
|
|
333
|
+
# Use in different contexts
|
|
334
|
+
result1 = normalize.bind(x=5) # Single value
|
|
335
|
+
result2 = normalize.product(x=[1,2,3]) # Multiple values
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### CLI Pipelines
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from daglite import pipeline
|
|
342
|
+
|
|
343
|
+
@pipeline
|
|
344
|
+
def ml_pipeline(model_path: str, data_path: str, epochs: int = 10):
|
|
345
|
+
"""Train a machine learning model."""
|
|
346
|
+
data = load_data.bind(path=data_path)
|
|
347
|
+
model = train_model.bind(data=data, epochs=epochs)
|
|
348
|
+
return save_model.bind(model=model, path=model_path)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
Run from command line:
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
daglite run ml_pipeline --model-path model.pkl --data-path data.csv --epochs 20
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## 📊 Comparison
|
|
360
|
+
|
|
361
|
+
| Feature | Daglite | Airflow | Prefect | Dagster |
|
|
362
|
+
|---------|---------|---------|---------|---------|
|
|
363
|
+
| **Lightweight** | ✅ 0 deps | ❌ Heavy | ❌ Heavy | ❌ Heavy |
|
|
364
|
+
| **Type Safety** | ✅ Full | ⚠️ Partial | ⚠️ Partial | ✅ Full |
|
|
365
|
+
| **Pure Python** | ✅ Yes | ❌ YAML/Config | ⚠️ Decorators | ⚠️ Config |
|
|
366
|
+
| **Static DAGs** | ✅ Yes | ❌ Dynamic | ❌ Dynamic | ✅ Yes |
|
|
367
|
+
| **Fluent API** | ✅ Yes | ❌ No | ⚠️ Limited | ❌ No |
|
|
368
|
+
| **Testing** | ✅ Simple | ⚠️ Complex | ⚠️ Complex | ⚠️ Complex |
|
|
369
|
+
| **Use Case** | Local/ETL | Production Orchestration | Workflows | Data Pipelines |
|
|
370
|
+
|
|
371
|
+
---
|
|
372
|
+
|
|
373
|
+
## 📚 Documentation
|
|
374
|
+
|
|
375
|
+
Full documentation is available at **[cswartzvi.github.io/daglite](https://cswartzvi.github.io/daglite/)**
|
|
376
|
+
|
|
377
|
+
- [Getting Started Guide](https://cswartzvi.github.io/daglite/getting-started/)
|
|
378
|
+
- [User Guide](https://cswartzvi.github.io/daglite/user-guide/tasks/)
|
|
379
|
+
- [API Reference](https://cswartzvi.github.io/daglite/api/)
|
|
380
|
+
- [Examples](https://cswartzvi.github.io/daglite/examples/)
|
|
381
|
+
|
|
382
|
+
---
|
|
383
|
+
|
|
384
|
+
## 🤝 Contributing
|
|
385
|
+
|
|
386
|
+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
387
|
+
|
|
388
|
+
---
|
|
389
|
+
|
|
390
|
+
## 📄 License
|
|
391
|
+
|
|
392
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
393
|
+
|
|
394
|
+
---
|
|
395
|
+
|
|
396
|
+
## 🙏 Acknowledgments
|
|
397
|
+
|
|
398
|
+
Inspired by:
|
|
399
|
+
- [Apache Airflow](https://airflow.apache.org/) - DAG orchestration patterns
|
|
400
|
+
- [Prefect](https://www.prefect.io/) - Modern workflow design
|
|
401
|
+
- [Dask](https://dask.org/) - Lazy evaluation and graph execution
|
|
402
|
+
- [itertools](https://docs.python.org/3/library/itertools.html) - Composable operations
|
|
403
|
+
|
|
404
|
+
---
|
|
405
|
+
|
|
406
|
+
**Built with ❤️ for simplicity and type safety**
|