py-data-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engine/__init__.py +37 -0
- data_engine/application/__init__.py +39 -0
- data_engine/application/actions.py +42 -0
- data_engine/application/catalog.py +151 -0
- data_engine/application/control.py +213 -0
- data_engine/application/details.py +73 -0
- data_engine/application/runtime.py +449 -0
- data_engine/application/workspace.py +62 -0
- data_engine/authoring/__init__.py +14 -0
- data_engine/authoring/builder.py +31 -0
- data_engine/authoring/execution/__init__.py +6 -0
- data_engine/authoring/execution/app.py +6 -0
- data_engine/authoring/execution/context.py +82 -0
- data_engine/authoring/execution/continuous.py +176 -0
- data_engine/authoring/execution/grouped.py +106 -0
- data_engine/authoring/execution/logging.py +83 -0
- data_engine/authoring/execution/polling.py +135 -0
- data_engine/authoring/execution/runner.py +210 -0
- data_engine/authoring/execution/single.py +171 -0
- data_engine/authoring/flow.py +361 -0
- data_engine/authoring/helpers.py +160 -0
- data_engine/authoring/model.py +59 -0
- data_engine/authoring/primitives.py +430 -0
- data_engine/authoring/services.py +42 -0
- data_engine/devtools/__init__.py +3 -0
- data_engine/devtools/project_ast_map.py +503 -0
- data_engine/docs/__init__.py +1 -0
- data_engine/docs/sphinx_source/_static/custom.css +13 -0
- data_engine/docs/sphinx_source/api.rst +42 -0
- data_engine/docs/sphinx_source/conf.py +37 -0
- data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
- data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
- data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
- data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
- data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
- data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
- data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
- data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
- data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
- data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
- data_engine/docs/sphinx_source/guides/project-map.md +118 -0
- data_engine/docs/sphinx_source/guides/recipes.md +268 -0
- data_engine/docs/sphinx_source/index.rst +22 -0
- data_engine/domain/__init__.py +92 -0
- data_engine/domain/actions.py +69 -0
- data_engine/domain/catalog.py +128 -0
- data_engine/domain/details.py +214 -0
- data_engine/domain/diagnostics.py +56 -0
- data_engine/domain/errors.py +104 -0
- data_engine/domain/inspection.py +99 -0
- data_engine/domain/logs.py +118 -0
- data_engine/domain/operations.py +172 -0
- data_engine/domain/operator.py +72 -0
- data_engine/domain/runs.py +155 -0
- data_engine/domain/runtime.py +279 -0
- data_engine/domain/source_state.py +17 -0
- data_engine/domain/support.py +54 -0
- data_engine/domain/time.py +23 -0
- data_engine/domain/workspace.py +159 -0
- data_engine/flow_modules/__init__.py +1 -0
- data_engine/flow_modules/flow_module_compiler.py +179 -0
- data_engine/flow_modules/flow_module_loader.py +201 -0
- data_engine/helpers/__init__.py +25 -0
- data_engine/helpers/duckdb.py +705 -0
- data_engine/hosts/__init__.py +1 -0
- data_engine/hosts/daemon/__init__.py +23 -0
- data_engine/hosts/daemon/app.py +221 -0
- data_engine/hosts/daemon/bootstrap.py +69 -0
- data_engine/hosts/daemon/client.py +465 -0
- data_engine/hosts/daemon/commands.py +64 -0
- data_engine/hosts/daemon/composition.py +310 -0
- data_engine/hosts/daemon/constants.py +15 -0
- data_engine/hosts/daemon/entrypoints.py +97 -0
- data_engine/hosts/daemon/lifecycle.py +191 -0
- data_engine/hosts/daemon/manager.py +272 -0
- data_engine/hosts/daemon/ownership.py +126 -0
- data_engine/hosts/daemon/runtime_commands.py +188 -0
- data_engine/hosts/daemon/runtime_control.py +31 -0
- data_engine/hosts/daemon/server.py +84 -0
- data_engine/hosts/daemon/shared_state.py +147 -0
- data_engine/hosts/daemon/state_sync.py +101 -0
- data_engine/platform/__init__.py +1 -0
- data_engine/platform/identity.py +35 -0
- data_engine/platform/local_settings.py +146 -0
- data_engine/platform/theme.py +259 -0
- data_engine/platform/workspace_models.py +190 -0
- data_engine/platform/workspace_policy.py +333 -0
- data_engine/runtime/__init__.py +1 -0
- data_engine/runtime/file_watch.py +185 -0
- data_engine/runtime/ledger_models.py +116 -0
- data_engine/runtime/runtime_db.py +938 -0
- data_engine/runtime/shared_state.py +523 -0
- data_engine/services/__init__.py +49 -0
- data_engine/services/daemon.py +64 -0
- data_engine/services/daemon_state.py +40 -0
- data_engine/services/flow_catalog.py +102 -0
- data_engine/services/flow_execution.py +48 -0
- data_engine/services/ledger.py +85 -0
- data_engine/services/logs.py +65 -0
- data_engine/services/runtime_binding.py +105 -0
- data_engine/services/runtime_execution.py +126 -0
- data_engine/services/runtime_history.py +62 -0
- data_engine/services/settings.py +58 -0
- data_engine/services/shared_state.py +28 -0
- data_engine/services/theme.py +59 -0
- data_engine/services/workspace_provisioning.py +224 -0
- data_engine/services/workspaces.py +74 -0
- data_engine/ui/__init__.py +3 -0
- data_engine/ui/cli/__init__.py +19 -0
- data_engine/ui/cli/app.py +161 -0
- data_engine/ui/cli/commands_doctor.py +178 -0
- data_engine/ui/cli/commands_run.py +80 -0
- data_engine/ui/cli/commands_start.py +100 -0
- data_engine/ui/cli/commands_workspace.py +97 -0
- data_engine/ui/cli/dependencies.py +44 -0
- data_engine/ui/cli/parser.py +56 -0
- data_engine/ui/gui/__init__.py +25 -0
- data_engine/ui/gui/app.py +116 -0
- data_engine/ui/gui/bootstrap.py +487 -0
- data_engine/ui/gui/bootstrapper.py +140 -0
- data_engine/ui/gui/cache_models.py +23 -0
- data_engine/ui/gui/control_support.py +185 -0
- data_engine/ui/gui/controllers/__init__.py +6 -0
- data_engine/ui/gui/controllers/flows.py +439 -0
- data_engine/ui/gui/controllers/runtime.py +245 -0
- data_engine/ui/gui/dialogs/__init__.py +12 -0
- data_engine/ui/gui/dialogs/messages.py +88 -0
- data_engine/ui/gui/dialogs/previews.py +222 -0
- data_engine/ui/gui/helpers/__init__.py +62 -0
- data_engine/ui/gui/helpers/inspection.py +81 -0
- data_engine/ui/gui/helpers/lifecycle.py +112 -0
- data_engine/ui/gui/helpers/scroll.py +28 -0
- data_engine/ui/gui/helpers/theming.py +87 -0
- data_engine/ui/gui/icons/dark_light.svg +12 -0
- data_engine/ui/gui/icons/documentation.svg +1 -0
- data_engine/ui/gui/icons/failed.svg +3 -0
- data_engine/ui/gui/icons/group.svg +4 -0
- data_engine/ui/gui/icons/home.svg +2 -0
- data_engine/ui/gui/icons/manual.svg +2 -0
- data_engine/ui/gui/icons/poll.svg +2 -0
- data_engine/ui/gui/icons/schedule.svg +4 -0
- data_engine/ui/gui/icons/settings.svg +2 -0
- data_engine/ui/gui/icons/started.svg +3 -0
- data_engine/ui/gui/icons/success.svg +3 -0
- data_engine/ui/gui/icons/view-log.svg +3 -0
- data_engine/ui/gui/icons.py +50 -0
- data_engine/ui/gui/launcher.py +48 -0
- data_engine/ui/gui/presenters/__init__.py +72 -0
- data_engine/ui/gui/presenters/docs.py +140 -0
- data_engine/ui/gui/presenters/logs.py +58 -0
- data_engine/ui/gui/presenters/runtime_projection.py +29 -0
- data_engine/ui/gui/presenters/sidebar.py +88 -0
- data_engine/ui/gui/presenters/steps.py +148 -0
- data_engine/ui/gui/presenters/workspace.py +39 -0
- data_engine/ui/gui/presenters/workspace_binding.py +75 -0
- data_engine/ui/gui/presenters/workspace_settings.py +182 -0
- data_engine/ui/gui/preview_models.py +37 -0
- data_engine/ui/gui/render_support.py +241 -0
- data_engine/ui/gui/rendering/__init__.py +12 -0
- data_engine/ui/gui/rendering/artifacts.py +95 -0
- data_engine/ui/gui/rendering/icons.py +50 -0
- data_engine/ui/gui/runtime.py +47 -0
- data_engine/ui/gui/state_support.py +193 -0
- data_engine/ui/gui/support.py +214 -0
- data_engine/ui/gui/surface.py +209 -0
- data_engine/ui/gui/theme.py +720 -0
- data_engine/ui/gui/widgets/__init__.py +34 -0
- data_engine/ui/gui/widgets/config.py +41 -0
- data_engine/ui/gui/widgets/logs.py +62 -0
- data_engine/ui/gui/widgets/panels.py +507 -0
- data_engine/ui/gui/widgets/sidebar.py +130 -0
- data_engine/ui/gui/widgets/steps.py +84 -0
- data_engine/ui/tui/__init__.py +5 -0
- data_engine/ui/tui/app.py +222 -0
- data_engine/ui/tui/bootstrap.py +475 -0
- data_engine/ui/tui/bootstrapper.py +117 -0
- data_engine/ui/tui/controllers/__init__.py +6 -0
- data_engine/ui/tui/controllers/flows.py +349 -0
- data_engine/ui/tui/controllers/runtime.py +167 -0
- data_engine/ui/tui/runtime.py +34 -0
- data_engine/ui/tui/state_support.py +141 -0
- data_engine/ui/tui/support.py +63 -0
- data_engine/ui/tui/theme.py +204 -0
- data_engine/ui/tui/widgets.py +123 -0
- data_engine/views/__init__.py +109 -0
- data_engine/views/actions.py +80 -0
- data_engine/views/artifacts.py +58 -0
- data_engine/views/flow_display.py +69 -0
- data_engine/views/logs.py +54 -0
- data_engine/views/models.py +96 -0
- data_engine/views/presentation.py +133 -0
- data_engine/views/runs.py +62 -0
- data_engine/views/state.py +39 -0
- data_engine/views/status.py +13 -0
- data_engine/views/text.py +109 -0
- py_data_engine-0.1.0.dist-info/METADATA +330 -0
- py_data_engine-0.1.0.dist-info/RECORD +200 -0
- py_data_engine-0.1.0.dist-info/WHEEL +5 -0
- py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
- py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# Flow Methods
|
|
2
|
+
|
|
3
|
+
This page covers the small author-facing `Flow` surface.
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from data_engine import Flow
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## `Flow(group)`
|
|
10
|
+
|
|
11
|
+
Create a new immutable flow definition.
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
flow = Flow(group="Claims")
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Rules:
|
|
18
|
+
|
|
19
|
+
- `group` must be a non-empty string
|
|
20
|
+
- the flow-module filename provides the flow identity
|
|
21
|
+
- the returned object is immutable, so each fluent call returns a new `Flow`
|
|
22
|
+
|
|
23
|
+
Immutability matters because it keeps authoring predictable. Each chained call produces a new flow definition rather than mutating hidden shared state.
|
|
24
|
+
|
|
25
|
+
## `watch(...)`
|
|
26
|
+
|
|
27
|
+
Configure a runtime trigger for manual, poll, or schedule execution.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
flow = flow.watch(
|
|
31
|
+
mode="poll",
|
|
32
|
+
source="../../example_data/Input/claims_flat",
|
|
33
|
+
interval="5s",
|
|
34
|
+
extensions=[".xlsx", ".xlsm"],
|
|
35
|
+
settle=1,
|
|
36
|
+
)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
flow = flow.watch(
|
|
41
|
+
mode="poll",
|
|
42
|
+
source="../../example_data/Settings/single_watch.xlsx",
|
|
43
|
+
interval="5s",
|
|
44
|
+
)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
flow = flow.watch(mode="schedule", run_as="batch", interval="15m")
|
|
49
|
+
flow = flow.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
|
|
50
|
+
flow = flow.watch(mode="schedule", run_as="batch", time="10:31", source="../../example_data/Settings/single_watch.xlsx")
|
|
51
|
+
flow = flow.watch(mode="schedule", run_as="batch", time=["08:15", "14:45"])
|
|
52
|
+
flow = flow.watch(mode="manual")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Rules:
|
|
56
|
+
|
|
57
|
+
- `mode` must be one of `manual`, `poll`, or `schedule`
|
|
58
|
+
- `run_as` defaults to `individual`
|
|
59
|
+
- `run_as="individual"` means one run per concrete source file
|
|
60
|
+
- `run_as="batch"` means one run for the watched root as a whole
|
|
61
|
+
- poll flows require `source=` and `interval=`
|
|
62
|
+
- schedule flows accept exactly one of `interval=` or `time=`
|
|
63
|
+
- `time` accepts either one `HH:MM` string or a collection of `HH:MM` strings
|
|
64
|
+
- `extensions` and `settle` are poll-only options
|
|
65
|
+
- missing or invalid paths fail immediately, then recover once the path becomes valid
|
|
66
|
+
- poll freshness compares the current source file signature against the runtime ledger
|
|
67
|
+
|
|
68
|
+
Practical guidance:
|
|
69
|
+
|
|
70
|
+
- use `manual` for explicit button-driven flows
|
|
71
|
+
- use `poll` when the source changing should be the trigger
|
|
72
|
+
- use `schedule` when time should be the trigger
|
|
73
|
+
- use `run_as="batch"` when the flow should reason about a folder or root as one unit
|
|
74
|
+
- use `run_as="individual"` when each source file should become its own run
|
|
75
|
+
|
|
76
|
+
`watch(...)` is where you describe orchestration intent, not transformation logic.
|
|
77
|
+
|
|
78
|
+
## `mirror(root=...)`
|
|
79
|
+
|
|
80
|
+
Bind a mirrored output namespace rooted at one directory.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
flow = flow.mirror(root="../../example_data/Output/example_mirror")
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
`mirror(...)` does not write files. It defines the output namespace exposed later through `context.mirror`.
|
|
87
|
+
|
|
88
|
+
You can omit `mirror(...)` entirely if the flow has no need for a mirrored output namespace.
|
|
89
|
+
|
|
90
|
+
## `step(fn, use=None, save_as=None, label=None)`
|
|
91
|
+
|
|
92
|
+
Add one generic callable step.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
flow = flow.step(read_claims, save_as="raw_df")
|
|
96
|
+
flow = flow.step(clean_claims, use="raw_df", save_as="clean_df")
|
|
97
|
+
flow = flow.step(write_output, use="clean_df", label="Write Parquet")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Rules:
|
|
101
|
+
|
|
102
|
+
- `fn` must be callable
|
|
103
|
+
- `fn` must accept exactly one `context` parameter
|
|
104
|
+
- `use=` selects a previously saved object
|
|
105
|
+
- `save_as=` stores the returned object
|
|
106
|
+
- `label=` overrides the UI display name
|
|
107
|
+
|
|
108
|
+
The return value always becomes `context.current`.
|
|
109
|
+
|
|
110
|
+
This is the default workhorse method. Most flows are easiest to read when they consist mainly of `step(...)` calls, with occasional `collect(...)` and `map(...)` calls where batching is truly needed.
|
|
111
|
+
|
|
112
|
+
## `map(fn, use=None, save_as=None, label=None)`
|
|
113
|
+
|
|
114
|
+
Map one callable across the current batch.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
flow = flow.collect([".pdf"])
|
|
118
|
+
flow = flow.map(validate_pdf)
|
|
119
|
+
flow = flow.map(validate_pdf_with_context, label="Validate Pdf")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
def validate_pdf(file_ref):
|
|
124
|
+
return {"name": file_ref.name, "ok": file_ref.exists()}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_pdf_with_context(context, file_ref):
|
|
128
|
+
return {"flow": context.flow_name, "name": file_ref.name}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Rules:
|
|
132
|
+
|
|
133
|
+
- `map()` expects the current value to be iterable
|
|
134
|
+
- `fn` may accept either `(item)` or `(context, item)`
|
|
135
|
+
- the mapped results are returned as a `Batch`
|
|
136
|
+
- `map()` raises when the current batch is empty
|
|
137
|
+
- `use=`, `save_as=`, and `label=` work the same way they do for `step()`
|
|
138
|
+
|
|
139
|
+
Reach for `map(...)` when the same callable should run once per collected item. If the callable should reason about the whole collection, switch back to a normal `step(...)`.
|
|
140
|
+
|
|
141
|
+
## `step_each(fn, use=None, save_as=None, label=None)`
|
|
142
|
+
|
|
143
|
+
`step_each(...)` is an alias for `map(...)`.
|
|
144
|
+
|
|
145
|
+
Use whichever reads better in the flow module:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
flow = flow.map(read_claims)
|
|
149
|
+
flow = flow.step_each(read_claims)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## `collect(extensions, root=None, recursive=False, use=None, save_as=None, label=None)`
|
|
153
|
+
|
|
154
|
+
Collect matching files into a `Batch` of `FileRef` items.
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
flow = flow.collect([".xlsx"])
|
|
158
|
+
flow = flow.collect([".pdf"], recursive=True)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Behavior:
|
|
162
|
+
|
|
163
|
+
- uses `root=` when provided
|
|
164
|
+
- otherwise falls back to `context.source.root`
|
|
165
|
+
- returns a `Batch`, not a raw list
|
|
166
|
+
- each item exposes `.name`, `.path`, `.stem`, `.suffix`, and `.parent`
|
|
167
|
+
|
|
168
|
+
If `root=` is omitted, the runtime falls back to the current source root. That is often the cleanest choice for poll or scheduled batch flows already bound to a source.
|
|
169
|
+
|
|
170
|
+
## `run_once()`
|
|
171
|
+
|
|
172
|
+
Run the flow one time and return the completed contexts.
|
|
173
|
+
|
|
174
|
+
Use this when you want a one-off Python-driven execution rather than continuous watching.
|
|
175
|
+
|
|
176
|
+
## `run()`
|
|
177
|
+
|
|
178
|
+
Start continuous execution for watched poll or schedule flows.
|
|
179
|
+
|
|
180
|
+
This is the entrypoint behind long-lived runtime behavior.
|
|
181
|
+
|
|
182
|
+
## `preview(use=None)`
|
|
183
|
+
|
|
184
|
+
Run the flow once for notebook inspection and return the real object it produced.
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
build().preview()
|
|
188
|
+
build().preview(use="raw_df").head(10)
|
|
189
|
+
build().preview(use="claim_frames")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Behavior:
|
|
193
|
+
|
|
194
|
+
- without `use=`, returns the final `context.current`
|
|
195
|
+
- with `use="name"`, runs only until the object saved with `save_as="name"` exists
|
|
196
|
+
- returns the real saved object, so dataframe methods like `.head(10)` work naturally
|
|
197
|
+
- avoids running later write/debug steps once the requested saved object is available
|
|
198
|
+
- if a poll flow has several source files at startup, preview deterministically picks the first source candidate for notebook inspection rather than trying to preview every file at once
|
|
199
|
+
|
|
200
|
+
`preview(...)` is especially useful while authoring notebook-backed flows because it lets you stop at a meaningful intermediate instead of running the whole flow to the final writer step every time.
|
|
201
|
+
|
|
202
|
+
## `show()`
|
|
203
|
+
|
|
204
|
+
Preview the single current result from a one-off flow.
|
|
205
|
+
|
|
206
|
+
Use this for quick interactive inspection when the final current value itself is the thing you want to see.
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
# Getting Started
|
|
2
|
+
|
|
3
|
+
This guide is for someone new to the code-defined Data Engine API and desktop app.
|
|
4
|
+
|
|
5
|
+
By the end, you should understand:
|
|
6
|
+
|
|
7
|
+
- what a flow is
|
|
8
|
+
- where flow modules live
|
|
9
|
+
- what a workspace contains
|
|
10
|
+
- how discovery and runtime execution work at a high level
|
|
11
|
+
- how to run a first flow end to end
|
|
12
|
+
- how batch workflows fit into the model
|
|
13
|
+
|
|
14
|
+
## The mental model
|
|
15
|
+
|
|
16
|
+
Data Engine has one source of truth for per-flow behavior: the `Flow` returned by `build()`.
|
|
17
|
+
|
|
18
|
+
In practice:
|
|
19
|
+
|
|
20
|
+
- the flow module defines the flow name, group, runtime mode, and ordered steps
|
|
21
|
+
- step functions do real work with native libraries such as Polars, DuckDB, and plain Python
|
|
22
|
+
- the desktop app discovers those flow modules inside the selected workspace and shows them as configurable runnable flows
|
|
23
|
+
|
|
24
|
+
The engine does not hide the real work behind a DSL. The fluent API owns orchestration, while the step callables own your actual business logic.
|
|
25
|
+
|
|
26
|
+
## The basic workspace layout
|
|
27
|
+
|
|
28
|
+
A typical authored workspace looks like this:
|
|
29
|
+
|
|
30
|
+
```text
|
|
31
|
+
workspaces/
|
|
32
|
+
example_workspace/
|
|
33
|
+
flow_modules/
|
|
34
|
+
flow_modules/flow_helpers/
|
|
35
|
+
config/
|
|
36
|
+
databases/
|
|
37
|
+
.workspace_state/
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The parts you will usually author directly are:
|
|
41
|
+
|
|
42
|
+
- `flow_modules/`: runnable flows in `.py` or `.ipynb`
|
|
43
|
+
- `flow_modules/flow_helpers/`: reusable helper modules imported from flows
|
|
44
|
+
- `config/`: workspace-local TOML files available through `context.config`
|
|
45
|
+
- `databases/`: a conventional home for workspace-local databases used through `context.database(...)`
|
|
46
|
+
|
|
47
|
+
The app can provision that shape for you without overwriting existing content.
|
|
48
|
+
|
|
49
|
+
## Where flow module sources live
|
|
50
|
+
|
|
51
|
+
Flow module sources are authored in:
|
|
52
|
+
|
|
53
|
+
- `workspaces/<workspace_id>/flow_modules/<name>.ipynb`
|
|
54
|
+
- `workspaces/<workspace_id>/flow_modules/<name>.py`
|
|
55
|
+
|
|
56
|
+
Reusable helper modules live in:
|
|
57
|
+
|
|
58
|
+
- `workspaces/<workspace_id>/flow_modules/flow_helpers/<name>.py`
|
|
59
|
+
|
|
60
|
+
Compiled runtime modules are generated into machine-local artifacts rather than into the authored workspace itself.
|
|
61
|
+
|
|
62
|
+
Each flow module should export:
|
|
63
|
+
|
|
64
|
+
- optional `DESCRIPTION`
|
|
65
|
+
- `build() -> Flow`
|
|
66
|
+
|
|
67
|
+
Display titles come from `Flow(label=...)` when provided. Otherwise the UI derives them from the flow-module filename.
|
|
68
|
+
|
|
69
|
+
## Your first flow
|
|
70
|
+
|
|
71
|
+
A minimal scheduled flow can create data in memory and write it out:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from data_engine import Flow
|
|
75
|
+
import polars as pl
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_dates(context):
|
|
79
|
+
return pl.DataFrame({"day": [1, 2, 3]})
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def write_dates(context):
|
|
83
|
+
output = context.mirror.file("dates.parquet")
|
|
84
|
+
context.current.write_parquet(output)
|
|
85
|
+
return output
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def build():
|
|
89
|
+
return (
|
|
90
|
+
Flow(group="Reference")
|
|
91
|
+
.watch(mode="schedule", run_as="batch", interval="1h")
|
|
92
|
+
.mirror(root="../../example_data/Output/date_dimension")
|
|
93
|
+
.step(build_dates, save_as="dates_df")
|
|
94
|
+
.step(write_dates, use="dates_df", label="Write Parquet")
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
That example shows the full shape:
|
|
99
|
+
|
|
100
|
+
1. create `Flow(group=...)`
|
|
101
|
+
2. attach a runtime mode with `watch(...)`
|
|
102
|
+
3. optionally attach `mirror(...)`
|
|
103
|
+
4. add ordered `step(...)` callables
|
|
104
|
+
5. return the built flow from `build()`
|
|
105
|
+
|
|
106
|
+
The return value from each step becomes `context.current`, so later steps can keep operating on the current object or reach back to previously saved objects through `use=`.
|
|
107
|
+
|
|
108
|
+
## What the app actually does with that flow
|
|
109
|
+
|
|
110
|
+
Once the flow is discovered, the desktop app uses it for:
|
|
111
|
+
|
|
112
|
+
- grouping and labels in the home view
|
|
113
|
+
- deciding whether the flow is manual, poll, or schedule
|
|
114
|
+
- deciding whether the flow participates in the engine
|
|
115
|
+
- rendering step names and inspectable outputs
|
|
116
|
+
- manual runs and engine runs for the selected workspace
|
|
117
|
+
|
|
118
|
+
The app itself binds to one workspace at a time, so when you switch workspaces, the discovered flows, runtime ledger, daemon state, and visible runs all switch with it.
|
|
119
|
+
|
|
120
|
+
## A starter-style polling flow
|
|
121
|
+
|
|
122
|
+
This shape maps directly to starter flows such as `example_mirror` and `example_poll`:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from data_engine import Flow
|
|
126
|
+
import polars as pl
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def read_claims(context):
|
|
130
|
+
return pl.read_excel(context.source.path)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def keep_open(context):
|
|
134
|
+
return context.current.filter(pl.col("status") == "OPEN")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def write_target(context):
|
|
138
|
+
output = context.mirror.with_suffix(".parquet")
|
|
139
|
+
context.current.write_parquet(output)
|
|
140
|
+
return output
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def build():
|
|
144
|
+
return (
|
|
145
|
+
Flow(group="Claims")
|
|
146
|
+
.watch(
|
|
147
|
+
mode="poll",
|
|
148
|
+
source="../../example_data/Input/claims_dated",
|
|
149
|
+
interval="5s",
|
|
150
|
+
extensions=[".xlsx", ".xlsm"],
|
|
151
|
+
settle=1,
|
|
152
|
+
)
|
|
153
|
+
.mirror(root="../../example_data/Output/example_poll")
|
|
154
|
+
.step(read_claims, save_as="raw_df")
|
|
155
|
+
.step(keep_open, use="raw_df", save_as="filtered_df")
|
|
156
|
+
.step(write_target, use="filtered_df", label="Write Parquet")
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
This is a good first mental model for source-driven flows:
|
|
161
|
+
|
|
162
|
+
- `watch(...)` tells the runtime what to listen to
|
|
163
|
+
- `context.source` tells the step which concrete file is active
|
|
164
|
+
- `mirror(...)` defines where mirrored outputs belong
|
|
165
|
+
- returning the written path makes the result inspectable in the UI
|
|
166
|
+
|
|
167
|
+
## Batch-oriented files
|
|
168
|
+
|
|
169
|
+
When you want to treat a folder of files as one runtime object, use `Flow.collect(...)` and either `Flow.map(...)` or `Flow.step_each(...)`.
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from data_engine import Flow
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def validate_pdf(file_ref):
|
|
176
|
+
return {"name": file_ref.name, "ok": file_ref.exists()}
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def summarize_results(context):
|
|
180
|
+
return tuple(item["name"] for item in context.current if item["ok"])
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def build():
|
|
184
|
+
return (
|
|
185
|
+
Flow(group="Claims")
|
|
186
|
+
.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/pdfs")
|
|
187
|
+
.collect([".pdf"], save_as="pdf_files")
|
|
188
|
+
.map(validate_pdf, use="pdf_files", save_as="pdf_results")
|
|
189
|
+
.step(summarize_results, use="pdf_results")
|
|
190
|
+
)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
`Flow.collect(...)` returns a `Batch` of `FileRef` items.
|
|
194
|
+
|
|
195
|
+
`Flow.map(...)` runs one callable per item and returns a new `Batch`.
|
|
196
|
+
|
|
197
|
+
`Flow.step_each(...)` is the same operation with a name that can read more clearly in some flows.
|
|
198
|
+
|
|
199
|
+
If the batch is empty, both forms raise immediately. That behavior is intentional so batch flows fail loudly instead of silently producing ambiguous "nothing happened" results.
|
|
200
|
+
|
|
201
|
+
## Running flows from Python
|
|
202
|
+
|
|
203
|
+
Load one discovered flow:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from data_engine import load_flow
|
|
207
|
+
|
|
208
|
+
built = load_flow("example_poll")
|
|
209
|
+
results = built.run_once()
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Discover everything the workspace exposes:
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from data_engine import discover_flows, run
|
|
216
|
+
|
|
217
|
+
flows = discover_flows()
|
|
218
|
+
run(*flows)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Notebook-authored flows also support preview-oriented authoring:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
build().preview()
|
|
225
|
+
build().preview(use="raw_df")
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
That is often the fastest way to sanity-check a flow while you are still writing it.
|
|
229
|
+
|
|
230
|
+
For poll flows that watch a folder, `preview(...)` uses one deterministic startup source as a representative notebook preview rather than trying to run every discovered file.
|
|
231
|
+
|
|
232
|
+
## Manual, poll, and schedule at a glance
|
|
233
|
+
|
|
234
|
+
### Manual
|
|
235
|
+
|
|
236
|
+
- `watch(mode="manual")`
|
|
237
|
+
- `context.current` starts as `None`
|
|
238
|
+
- useful for ad hoc or UI-driven runs
|
|
239
|
+
- does not require a source binding
|
|
240
|
+
|
|
241
|
+
### Poll
|
|
242
|
+
|
|
243
|
+
- `watch(mode="poll", ...)`
|
|
244
|
+
- watches either one file or a directory of source files
|
|
245
|
+
- the first step receives the active source through `context.source`
|
|
246
|
+
- freshness compares the current source file signature against the runtime ledger
|
|
247
|
+
- `extensions=` and `settle=` only apply here
|
|
248
|
+
|
|
249
|
+
### Schedule
|
|
250
|
+
|
|
251
|
+
- `watch(mode="schedule", ...)`
|
|
252
|
+
- runs on an interval or on one or more wall-clock times
|
|
253
|
+
- supports one `time="HH:MM"` value or a collection of times
|
|
254
|
+
- often starts by building data in memory or loading from a known source root
|
|
255
|
+
|
|
256
|
+
## A few good habits early
|
|
257
|
+
|
|
258
|
+
- keep import-time code side-effect free
|
|
259
|
+
- keep expensive work inside steps, not at module import
|
|
260
|
+
- return output paths from writer steps when you want the UI `Inspect` action to be available for the result
|
|
261
|
+
- move reusable SQL, parsing helpers, and constants into `flow_modules/flow_helpers/`
|
|
262
|
+
- use `context.config` for workspace-local TOML configuration rather than inventing ad hoc config loading in every flow
|
|
263
|
+
- use `context.database(...)` when you want a conventional workspace-local database path
|
|
264
|
+
|
|
265
|
+
## Next steps
|
|
266
|
+
|
|
267
|
+
- Read [Core Concepts](core-concepts.md)
|
|
268
|
+
- Read [Authoring Flow Modules](authoring-flow-modules.md)
|
|
269
|
+
- Read [Flow Methods](flow-methods.md)
|
|
270
|
+
- Read [FlowContext](flow-context.md)
|
|
271
|
+
- Read [App Runtime and Workspaces](app-runtime-and-workspaces.md)
|