dagabaaz 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. dagabaaz-0.0.1/.claude/commands/release.md +25 -0
  2. dagabaaz-0.0.1/.github/workflows/ci.yml +23 -0
  3. dagabaaz-0.0.1/.github/workflows/publish.yml +33 -0
  4. dagabaaz-0.0.1/.gitignore +7 -0
  5. dagabaaz-0.0.1/LICENSE +21 -0
  6. dagabaaz-0.0.1/PKG-INFO +305 -0
  7. dagabaaz-0.0.1/README.md +290 -0
  8. dagabaaz-0.0.1/pyproject.toml +37 -0
  9. dagabaaz-0.0.1/src/dagabaaz/__init__.py +29 -0
  10. dagabaaz-0.0.1/src/dagabaaz/bindings.py +181 -0
  11. dagabaaz-0.0.1/src/dagabaaz/constants.py +175 -0
  12. dagabaaz-0.0.1/src/dagabaaz/expressions.py +436 -0
  13. dagabaaz-0.0.1/src/dagabaaz/filter.py +483 -0
  14. dagabaaz-0.0.1/src/dagabaaz/graph.py +253 -0
  15. dagabaaz-0.0.1/src/dagabaaz/models.py +230 -0
  16. dagabaaz-0.0.1/src/dagabaaz/orchestrator.py +529 -0
  17. dagabaaz-0.0.1/src/dagabaaz/pipes.py +307 -0
  18. dagabaaz-0.0.1/src/dagabaaz/plugins.py +51 -0
  19. dagabaaz-0.0.1/src/dagabaaz/py.typed +0 -0
  20. dagabaaz-0.0.1/src/dagabaaz/retry.py +133 -0
  21. dagabaaz-0.0.1/src/dagabaaz/schema.py +188 -0
  22. dagabaaz-0.0.1/src/dagabaaz/store.py +417 -0
  23. dagabaaz-0.0.1/src/dagabaaz/task_input.py +281 -0
  24. dagabaaz-0.0.1/src/dagabaaz/topology.py +126 -0
  25. dagabaaz-0.0.1/tests/__init__.py +0 -0
  26. dagabaaz-0.0.1/tests/expression_fixtures.json +16 -0
  27. dagabaaz-0.0.1/tests/test_artifact_filter.py +481 -0
  28. dagabaaz-0.0.1/tests/test_dag_expressions.py +75 -0
  29. dagabaaz-0.0.1/tests/test_dag_graph.py +192 -0
  30. dagabaaz-0.0.1/tests/test_dag_inputs.py +305 -0
  31. dagabaaz-0.0.1/tests/test_dag_retry.py +77 -0
  32. dagabaaz-0.0.1/tests/test_dag_schema_merge.py +101 -0
  33. dagabaaz-0.0.1/tests/test_expressions.py +799 -0
  34. dagabaaz-0.0.1/uv.lock +272 -0
@@ -0,0 +1,25 @@
1
+ ---
2
+ description: Bump version, commit, tag, and push to trigger PyPI publish
3
+ allowed-tools: Bash(git *), Bash(uv *), Bash(grep *), Read, Edit
4
+ ---
5
+
6
+ Release version $ARGUMENTS to PyPI.
7
+
8
+ Pre-flight checks (stop and report if any fail):
9
+ 1. Verify working tree is clean: `git status --short` must produce no output
10
+ 2. Verify current branch is main: `git branch --show-current` must output `main`
11
+ 3. Verify local main is up to date: `git fetch origin main && git diff --quiet main origin/main` -- if it fails, tell the user to pull first
12
+ 4. Verify tag doesn't exist: `git tag -l "v$ARGUMENTS"` must produce no output
13
+ 5. Verify `$ARGUMENTS` looks like a valid semver (e.g. 0.2.0, 1.0.0)
14
+ 6. Verify `pyproject.toml` has a static `version` field under `[project]`, not listed in `dynamic`
15
+
16
+ Steps:
17
+ 1. Update the `version` field in `pyproject.toml` to `$ARGUMENTS`
18
+ 2. Run `uv run pytest` to verify tests pass -- if they fail, run `git checkout pyproject.toml` to revert and stop
19
+ 3. Run `uv lock` to update the lockfile
20
+ 4. Stage only `pyproject.toml` and `uv.lock`: `git add pyproject.toml uv.lock`
21
+ 5. Commit with message `release: v$ARGUMENTS`
22
+ 6. Create annotated git tag: `git tag -a "v$ARGUMENTS" -m "Release v$ARGUMENTS"`
23
+ 7. Push commit and tag separately: `git push origin main && git push origin "v$ARGUMENTS"`
24
+
25
+ If tests fail, revert pyproject.toml with `git checkout pyproject.toml` and stop. Do not commit or tag.
@@ -0,0 +1,23 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v7
14
+ - run: uv sync --all-extras
15
+ - run: uv run pytest
16
+
17
+ lint:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: astral-sh/setup-uv@v7
22
+ - run: uv sync --extra dev
23
+ - run: uv run ruff check src/ tests/
@@ -0,0 +1,33 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v7
14
+ - run: uv sync --all-extras
15
+ - run: uv run pytest
16
+
17
+ publish:
18
+ needs: test
19
+ runs-on: ubuntu-latest
20
+ environment:
21
+ name: pypi
22
+ url: https://pypi.org/p/dagabaaz
23
+ permissions:
24
+ id-token: write
25
+ contents: read
26
+ steps:
27
+ - uses: actions/checkout@v4
28
+ with:
29
+ fetch-depth: 0
30
+ persist-credentials: false
31
+ - uses: astral-sh/setup-uv@v7
32
+ - run: uv build
33
+ - run: uv publish --trusted-publishing always
@@ -0,0 +1,7 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ dist/
5
+ *.egg-info/
6
+ .pytest_cache/
7
+ .ruff_cache/
dagabaaz-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Varun Chopra
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,305 @@
1
+ Metadata-Version: 2.4
2
+ Name: dagabaaz
3
+ Version: 0.0.1
4
+ Summary: A DAG execution engine for pipeline orchestration.
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pydantic>=2.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=8.0; extra == 'dev'
11
+ Requires-Dist: ruff; extra == 'dev'
12
+ Provides-Extra: re2
13
+ Requires-Dist: google-re2>=1.0; extra == 're2'
14
+ Description-Content-Type: text/markdown
15
+
16
+ # dagabaaz
17
+
18
+ A Python library that runs multi-step workflows as directed acyclic graphs. You define the steps and their dependencies. The engine figures out what to run next, routes data between steps, and handles failures.
19
+
20
+ ```
21
+ pip install dagabaaz
22
+ ```
23
+
24
+ Requires Python 3.12+. Optional: `google-re2` for ReDoS-safe regex in pipe expressions.
25
+
26
+ ## Why This Exists
27
+
28
+ Most DAG engines (Airflow, Prefect, Dagster) are platforms. They own the scheduler, the database, the UI, and the execution runtime. If you're building a product where pipelines are a *feature* rather than the whole product, you don't want a platform. You want a library you call from your own code.
29
+
30
+ Persistence and dispatch are behind a `Protocol`. Bring your own database and queue.
31
+
32
+ ## How It Works
33
+
34
+ Each step in a pipeline is called a **node**. Nodes produce **artifacts** (files or data records) that flow to downstream nodes. A single execution of a pipeline is called a **run**.
35
+
36
+ When a run starts:
37
+
38
+ 1. Root nodes (no dependencies) get tasks dispatched.
39
+ 2. Your workers execute tasks. Each task produces artifacts.
40
+ 3. When all tasks at a node finish, the engine finds downstream nodes whose dependencies are now satisfied.
41
+ 4. For each ready node, artifacts are collected from upstream, optional edge filters are applied, and new tasks are dispatched.
42
+ 5. This repeats until every node is done or a failure stops the run.
43
+
44
+ The engine does not execute tasks itself. Your `DagStore` implementation handles persistence and queue operations. The engine calls methods on it to dispatch work, check progress, and record outcomes.
45
+
46
+ ## Quick Start
47
+
48
+ ### 1. Define a pipeline
49
+
50
+ A pipeline is a list of `DagNode` objects. Each node has a slug (unique ID), a plugin name, and optional dependencies.
51
+
52
+ ```python
53
+ from dagabaaz.models import DagNode
54
+ from dagabaaz.constants import FanMode
55
+
56
+ nodes = [
57
+ DagNode(slug="source", plugin="fetch"),
58
+ DagNode(slug="process", plugin="transform", depends_on=["source"]),
59
+ DagNode(
60
+ slug="export",
61
+ plugin="export",
62
+ depends_on=["process"],
63
+ fan_mode=FanMode.AGGREGATE,
64
+ ),
65
+ ]
66
+ ```
67
+
68
+ ### 2. Implement `DagStore`
69
+
70
+ The engine talks to your infrastructure through the `DagStore` protocol. It has 20 methods covering task dispatch, state queries, and lifecycle transitions. See `store.py` for the full protocol.
71
+
72
+ Here are the three most important ones:
73
+
74
+ ```python
75
+ class MyStore:
76
+ def get_barrier_state(self, run_id, node_index):
77
+ # Return (run_status, total_tasks, completed_tasks)
78
+ ...
79
+
80
+ def try_claim_node_launch(self, run_id, node_index) -> bool:
81
+ # Returns True if this call claimed the node
82
+ ...
83
+
84
+ def dispatch_task(self, run_id, node_index, plugin_name, input_artifact_id) -> str:
85
+ # Create task record, push to your job queue, return task_id
86
+ ...
87
+ ```
88
+
89
+ ### 3. Start a run
90
+
91
+ ```python
92
+ from dagabaaz.orchestrator import start_run
93
+
94
+ root_indices = start_run(store, run_id="run-1", nodes=nodes)
95
+ # The engine found root nodes and called store.dispatch_task for each one.
96
+ ```
97
+
98
+ ### 4. Handle task completion
99
+
100
+ After your worker executes a task, call back into the engine so it can dispatch the next steps:
101
+
102
+ ```python
103
+ from dagabaaz.orchestrator import on_task_complete, OrchestratorCallbacks
104
+
105
+ callbacks = OrchestratorCallbacks(
106
+ on_run_completed=lambda run_id: print(f"Run {run_id} done"),
107
+ on_run_failed=lambda run_id: print(f"Run {run_id} failed"),
108
+ on_run_crashed=lambda run_id: print(f"Run {run_id} crashed"),
109
+ )
110
+
111
+ on_task_complete(
112
+ store,
113
+ task_id="task-1",
114
+ callbacks=callbacks,
115
+ # No routing nodes in this example. See Terminology for passthrough.
116
+ resolve_passthrough=lambda plugin: False,
117
+ )
118
+ ```
119
+
120
+ The engine checks if the completed task's node is fully done, finds which downstream nodes are now ready, and dispatches them.
121
+
122
+ ## Pipeline Patterns
123
+
124
+ ### Linear
125
+
126
+ Three steps in sequence. Each runs after its dependency completes.
127
+
128
+ ```python
129
+ nodes = [
130
+ DagNode(slug="fetch", plugin="fetch"),
131
+ DagNode(slug="transform", plugin="transform", depends_on=["fetch"]),
132
+ DagNode(slug="export", plugin="export", depends_on=["transform"]),
133
+ ]
134
+ ```
135
+
136
+ ### Fan-out / scatter-gather
137
+
138
+ A source produces multiple files. Two branches process each file in parallel. The merge node collects results that came from the same original file. If the source produced 10 files and there are 2 branches, the merge node gets 10 tasks, each with 2 results.
139
+
140
+ ```python
141
+ nodes = [
142
+ DagNode(slug="source", plugin="fetch"),
143
+ DagNode(slug="branch_a", plugin="process_a", depends_on=["source"]),
144
+ DagNode(slug="branch_b", plugin="process_b", depends_on=["source"]),
145
+ DagNode(
146
+ slug="merge",
147
+ plugin="merge",
148
+ depends_on=["branch_a", "branch_b"],
149
+ fan_mode=FanMode.GROUPED,
150
+ ),
151
+ ]
152
+ ```
153
+
154
+ ### Conditional routing with edge filters
155
+
156
+ Edge filters route artifacts to different branches by type. Video files go to one branch, subtitles to another.
157
+
158
+ ```python
159
+ from dagabaaz.models import DagNode, EdgeFilter, FilterRule
160
+ from dagabaaz.constants import FanMode, FilterOperator
161
+
162
+ nodes = [
163
+ DagNode(slug="source", plugin="fetch"),
164
+ DagNode(
165
+ slug="video",
166
+ plugin="transcode",
167
+ depends_on=["source"],
168
+ fan_mode=FanMode.AGGREGATE,
169
+ edge_filters={
170
+ "source": EdgeFilter(
171
+ rules=[FilterRule(field="file_type", operator=FilterOperator.EQ, value="video")]
172
+ )
173
+ },
174
+ ),
175
+ DagNode(
176
+ slug="subtitle",
177
+ plugin="parse_subs",
178
+ depends_on=["source"],
179
+ edge_filters={
180
+ "source": EdgeFilter(
181
+ rules=[FilterRule(field="file_type", operator=FilterOperator.EQ, value="subtitle")]
182
+ )
183
+ },
184
+ ),
185
+ ]
186
+ ```
187
+
188
+ When `source` produces a mix of `.mp4` and `.srt` files, the engine routes each type to the correct branch. If a branch receives no artifacts (e.g., no subtitles), it is marked `filtered` and does not block downstream nodes.
189
+
190
+ ## Integrating with a Job Queue
191
+
192
+ The engine dispatches work but does not execute it. Your `DagStore.dispatch_task` pushes a job to whatever queue you use. When a worker finishes, it calls back into the engine.
193
+
194
+ **Dispatch.** The engine calls `store.dispatch_task(...)`. Your implementation inserts a task row, pushes a job to the queue with the task ID as payload, and returns the task ID. Skipped, filtered, and failed tasks don't go through the queue. They get pre-terminal rows inserted directly.
195
+
196
+ **Consume.** A worker pulls a job from the queue, extracts the task ID, runs the plugin, and produces artifacts. Use `build_task_input` to assemble the data the plugin needs:
197
+
198
+ ```python
199
+ from dagabaaz.task_input import build_task_input
200
+
201
+ input_data = build_task_input(
202
+ store,
203
+ run_id="run-1",
204
+ node_index=1,
205
+ input_artifact_id="artifact-xyz",
206
+ nodes=nodes,
207
+ )
208
+ ```
209
+
210
+ **Complete.** The worker calls `on_task_complete`. The engine checks whether the node is fully done, finds which downstream nodes are now ready, and dispatches them.
211
+
212
+ On failure, `on_task_failed` marks the run as failed and cancels remaining tasks. On infrastructure crash, `on_task_crashed` does the same with a different terminal status.
213
+
214
+ ### Concurrency
215
+
216
+ The engine assumes calls to `on_task_complete` are serialized per run: at most one invocation per run may be in flight at any moment.
217
+
218
+ With Postgres, use `SELECT pg_advisory_xact_lock(hashtext(run_id))` before calling the engine. The engine's `try_claim_node_launch` (INSERT ON CONFLICT DO NOTHING) prevents double-launches if two workers race. `try_claim_run_terminal` (UPDATE WHERE status NOT IN terminal_set) ensures exactly one caller wins a state transition.
219
+
220
+ With Redis, use a distributed lock per run ID. With a single worker, no locking is needed.
221
+
222
+ ### Crash recovery
223
+
224
+ If a worker dies mid-task, the queue's visibility timeout returns the job to pending. The next worker re-runs the task and re-enters the engine. Idempotent claims prevent double-launches.
225
+
226
+ If your queue is Postgres-native (e.g., [postkit](https://github.com/varunchopra/postkit), graphile-worker, or pgboss), the task row insert and queue push happen in the same transaction, so dispatch is atomic.
227
+
228
+ ## Terminology
229
+
230
+ **Node**: A single step in a pipeline. Each node wraps a plugin and declares which other nodes it depends on.
231
+
232
+ **Artifact**: A piece of data produced by a task. Usually a file, but can be a data record. Artifacts flow between nodes.
233
+
234
+ **Task**: One concrete execution of a node. A single node can spawn multiple tasks depending on its fan mode.
235
+
236
+ **Run**: One execution of a pipeline. Contains all tasks across all nodes. Has its own lifecycle (running, completed, failed, etc.).
237
+
238
+ **Fan mode**: Controls how a node receives artifacts from upstream. Single (one task per artifact), aggregate (one task gets all artifacts), or grouped (one task per group of related artifacts). Set on the node, not the plugin.
239
+
240
+ **Origin artifact**: Every artifact remembers which root artifact started its processing chain. This is how grouped mode knows which artifacts belong together.
241
+
242
+ **Edge filter**: Rules on an edge that decide which artifacts pass through. All rules must match (AND logic). Can filter by file type, extension, size, name, or metadata fields.
243
+
244
+ **Input binding**: How a task input field gets its value. Four sources: upstream artifact field, literal config value, user-provided run input, or an expression template with transforms.
245
+
246
+ **Barrier sync**: A node runs only once every task of every one of its dependencies has finished. If any task fails, the run fails.
247
+
248
+ **Skipped**: A node is skipped when its upstream is dead. Cascades: if B is skipped, everything downstream of B is also skipped.
249
+
250
+ **Filtered**: A node is filtered when it has no artifacts to work with. Does not cascade. Downstream nodes still try to collect artifacts.
251
+
252
+ **Passthrough**: When a node has no artifacts, the engine can look further upstream to find some. But it only looks through routing nodes (like gates). If a processing node has no output, the engine treats that as intentional and stops looking.
253
+
254
+ **Terminal state**: A final state with no further transitions. Tasks: `completed`, `failed`, `crashed`, `cancelled`, `skipped`, `filtered`. Runs: `completed`, `failed`, `crashed`, `cancelled`.
255
+
256
+ ## Expression Language
257
+
258
+ Input bindings can use `{namespace.key | pipe}` expressions:
259
+
260
+ ```python
261
+ "{source.file_path}" # artifact field
262
+ "{source.title | upper | truncate(50)}" # with transforms
263
+ "{list(branch_a.url, branch_b.url) | join(,)}" # multiple sources
264
+ "{input.api_url | required}" # run input
265
+ "{config.output_format | default(mp4)}" # config value
266
+ ```
267
+
268
+ 30 built-in pipes: `upper`, `lower`, `trim`, `title`, `replace`, `strip`, `lstrip`, `rstrip`, `default`, `required`, `first`, `last`, `nth`, `join`, `basename`, `dirname`, `stem`, `ext`, `urlencode`, `urldecode`, `int`, `string`, `truncate`, `prepend`, `append`, `match`, `json_get`, `flatten`, `compact`, `pad`.
269
+
270
+ Expressions are validated at pipeline save time and evaluated at task execution time. No `eval()`.
271
+
272
+ ## Module Map
273
+
274
+ | Module | Purpose |
275
+ |--------|---------|
276
+ | `orchestrator` | Decides which nodes to run next after a task completes |
277
+ | `topology` | Caches dependency graphs so they aren't rebuilt on every task completion |
278
+ | `graph` | Graph algorithms: which nodes are ready, collecting artifacts from upstream |
279
+ | `filter` | Evaluates edge filter rules and groups artifacts by origin |
280
+ | `models` | Data types: `DagNode`, `DagArtifact`, `EdgeFilter`, binding sources |
281
+ | `constants` | Enums and status sets (`RunStatus`, `TaskStatus`, `FanMode`) |
282
+ | `store` | `DagStore` and `TaskInputStore` protocols |
283
+ | `bindings` | Resolves how each task input field gets its value |
284
+ | `expressions` | Parses and evaluates `{slug.key \| pipe}` templates |
285
+ | `pipes` | 30 built-in transform functions for expressions |
286
+ | `task_input` | Assembles the input data dict passed to plugins on the worker side |
287
+ | `schema` | Generates the input form schema for a pipeline |
288
+ | `retry` | Computes which nodes to re-run after a failure |
289
+ | `plugins` | `PluginMeta` protocol for plugin metadata |
290
+
291
+ ## Design Decisions
292
+
293
+ **Protocol, not base class.** `DagStore` is a `typing.Protocol`, not a base class. Your store implementation just needs the right method signatures.
294
+
295
+ **Fan mode belongs to the node, not the plugin.** The pipeline author decides how a node aggregates its inputs, not the plugin author. Plugins can suggest a default, but the node definition is what the engine uses.
296
+
297
+ **Filtered != skipped.** If a node has no artifacts to work with, it's marked `filtered` and downstream nodes still run. If a node's upstream is dead, it's marked `skipped` and everything downstream is also skipped. Mixing these up causes either false cascades or tasks running on missing data.
298
+
299
+ **Passthrough-aware artifact lookup.** When a node's immediate dependency has no artifacts, the engine walks up the dependency chain looking for the nearest ancestor that does. But it only walks through routing nodes. Processing nodes are barriers.
300
+
301
+ **No barrel exports.** Import from submodules directly: `from dagabaaz.models import DagNode`. The module path tells you where each symbol lives.
302
+
303
+ ## License
304
+
305
+ MIT