py-data-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engine/__init__.py +37 -0
- data_engine/application/__init__.py +39 -0
- data_engine/application/actions.py +42 -0
- data_engine/application/catalog.py +151 -0
- data_engine/application/control.py +213 -0
- data_engine/application/details.py +73 -0
- data_engine/application/runtime.py +449 -0
- data_engine/application/workspace.py +62 -0
- data_engine/authoring/__init__.py +14 -0
- data_engine/authoring/builder.py +31 -0
- data_engine/authoring/execution/__init__.py +6 -0
- data_engine/authoring/execution/app.py +6 -0
- data_engine/authoring/execution/context.py +82 -0
- data_engine/authoring/execution/continuous.py +176 -0
- data_engine/authoring/execution/grouped.py +106 -0
- data_engine/authoring/execution/logging.py +83 -0
- data_engine/authoring/execution/polling.py +135 -0
- data_engine/authoring/execution/runner.py +210 -0
- data_engine/authoring/execution/single.py +171 -0
- data_engine/authoring/flow.py +361 -0
- data_engine/authoring/helpers.py +160 -0
- data_engine/authoring/model.py +59 -0
- data_engine/authoring/primitives.py +430 -0
- data_engine/authoring/services.py +42 -0
- data_engine/devtools/__init__.py +3 -0
- data_engine/devtools/project_ast_map.py +503 -0
- data_engine/docs/__init__.py +1 -0
- data_engine/docs/sphinx_source/_static/custom.css +13 -0
- data_engine/docs/sphinx_source/api.rst +42 -0
- data_engine/docs/sphinx_source/conf.py +37 -0
- data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
- data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
- data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
- data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
- data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
- data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
- data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
- data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
- data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
- data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
- data_engine/docs/sphinx_source/guides/project-map.md +118 -0
- data_engine/docs/sphinx_source/guides/recipes.md +268 -0
- data_engine/docs/sphinx_source/index.rst +22 -0
- data_engine/domain/__init__.py +92 -0
- data_engine/domain/actions.py +69 -0
- data_engine/domain/catalog.py +128 -0
- data_engine/domain/details.py +214 -0
- data_engine/domain/diagnostics.py +56 -0
- data_engine/domain/errors.py +104 -0
- data_engine/domain/inspection.py +99 -0
- data_engine/domain/logs.py +118 -0
- data_engine/domain/operations.py +172 -0
- data_engine/domain/operator.py +72 -0
- data_engine/domain/runs.py +155 -0
- data_engine/domain/runtime.py +279 -0
- data_engine/domain/source_state.py +17 -0
- data_engine/domain/support.py +54 -0
- data_engine/domain/time.py +23 -0
- data_engine/domain/workspace.py +159 -0
- data_engine/flow_modules/__init__.py +1 -0
- data_engine/flow_modules/flow_module_compiler.py +179 -0
- data_engine/flow_modules/flow_module_loader.py +201 -0
- data_engine/helpers/__init__.py +25 -0
- data_engine/helpers/duckdb.py +705 -0
- data_engine/hosts/__init__.py +1 -0
- data_engine/hosts/daemon/__init__.py +23 -0
- data_engine/hosts/daemon/app.py +221 -0
- data_engine/hosts/daemon/bootstrap.py +69 -0
- data_engine/hosts/daemon/client.py +465 -0
- data_engine/hosts/daemon/commands.py +64 -0
- data_engine/hosts/daemon/composition.py +310 -0
- data_engine/hosts/daemon/constants.py +15 -0
- data_engine/hosts/daemon/entrypoints.py +97 -0
- data_engine/hosts/daemon/lifecycle.py +191 -0
- data_engine/hosts/daemon/manager.py +272 -0
- data_engine/hosts/daemon/ownership.py +126 -0
- data_engine/hosts/daemon/runtime_commands.py +188 -0
- data_engine/hosts/daemon/runtime_control.py +31 -0
- data_engine/hosts/daemon/server.py +84 -0
- data_engine/hosts/daemon/shared_state.py +147 -0
- data_engine/hosts/daemon/state_sync.py +101 -0
- data_engine/platform/__init__.py +1 -0
- data_engine/platform/identity.py +35 -0
- data_engine/platform/local_settings.py +146 -0
- data_engine/platform/theme.py +259 -0
- data_engine/platform/workspace_models.py +190 -0
- data_engine/platform/workspace_policy.py +333 -0
- data_engine/runtime/__init__.py +1 -0
- data_engine/runtime/file_watch.py +185 -0
- data_engine/runtime/ledger_models.py +116 -0
- data_engine/runtime/runtime_db.py +938 -0
- data_engine/runtime/shared_state.py +523 -0
- data_engine/services/__init__.py +49 -0
- data_engine/services/daemon.py +64 -0
- data_engine/services/daemon_state.py +40 -0
- data_engine/services/flow_catalog.py +102 -0
- data_engine/services/flow_execution.py +48 -0
- data_engine/services/ledger.py +85 -0
- data_engine/services/logs.py +65 -0
- data_engine/services/runtime_binding.py +105 -0
- data_engine/services/runtime_execution.py +126 -0
- data_engine/services/runtime_history.py +62 -0
- data_engine/services/settings.py +58 -0
- data_engine/services/shared_state.py +28 -0
- data_engine/services/theme.py +59 -0
- data_engine/services/workspace_provisioning.py +224 -0
- data_engine/services/workspaces.py +74 -0
- data_engine/ui/__init__.py +3 -0
- data_engine/ui/cli/__init__.py +19 -0
- data_engine/ui/cli/app.py +161 -0
- data_engine/ui/cli/commands_doctor.py +178 -0
- data_engine/ui/cli/commands_run.py +80 -0
- data_engine/ui/cli/commands_start.py +100 -0
- data_engine/ui/cli/commands_workspace.py +97 -0
- data_engine/ui/cli/dependencies.py +44 -0
- data_engine/ui/cli/parser.py +56 -0
- data_engine/ui/gui/__init__.py +25 -0
- data_engine/ui/gui/app.py +116 -0
- data_engine/ui/gui/bootstrap.py +487 -0
- data_engine/ui/gui/bootstrapper.py +140 -0
- data_engine/ui/gui/cache_models.py +23 -0
- data_engine/ui/gui/control_support.py +185 -0
- data_engine/ui/gui/controllers/__init__.py +6 -0
- data_engine/ui/gui/controllers/flows.py +439 -0
- data_engine/ui/gui/controllers/runtime.py +245 -0
- data_engine/ui/gui/dialogs/__init__.py +12 -0
- data_engine/ui/gui/dialogs/messages.py +88 -0
- data_engine/ui/gui/dialogs/previews.py +222 -0
- data_engine/ui/gui/helpers/__init__.py +62 -0
- data_engine/ui/gui/helpers/inspection.py +81 -0
- data_engine/ui/gui/helpers/lifecycle.py +112 -0
- data_engine/ui/gui/helpers/scroll.py +28 -0
- data_engine/ui/gui/helpers/theming.py +87 -0
- data_engine/ui/gui/icons/dark_light.svg +12 -0
- data_engine/ui/gui/icons/documentation.svg +1 -0
- data_engine/ui/gui/icons/failed.svg +3 -0
- data_engine/ui/gui/icons/group.svg +4 -0
- data_engine/ui/gui/icons/home.svg +2 -0
- data_engine/ui/gui/icons/manual.svg +2 -0
- data_engine/ui/gui/icons/poll.svg +2 -0
- data_engine/ui/gui/icons/schedule.svg +4 -0
- data_engine/ui/gui/icons/settings.svg +2 -0
- data_engine/ui/gui/icons/started.svg +3 -0
- data_engine/ui/gui/icons/success.svg +3 -0
- data_engine/ui/gui/icons/view-log.svg +3 -0
- data_engine/ui/gui/icons.py +50 -0
- data_engine/ui/gui/launcher.py +48 -0
- data_engine/ui/gui/presenters/__init__.py +72 -0
- data_engine/ui/gui/presenters/docs.py +140 -0
- data_engine/ui/gui/presenters/logs.py +58 -0
- data_engine/ui/gui/presenters/runtime_projection.py +29 -0
- data_engine/ui/gui/presenters/sidebar.py +88 -0
- data_engine/ui/gui/presenters/steps.py +148 -0
- data_engine/ui/gui/presenters/workspace.py +39 -0
- data_engine/ui/gui/presenters/workspace_binding.py +75 -0
- data_engine/ui/gui/presenters/workspace_settings.py +182 -0
- data_engine/ui/gui/preview_models.py +37 -0
- data_engine/ui/gui/render_support.py +241 -0
- data_engine/ui/gui/rendering/__init__.py +12 -0
- data_engine/ui/gui/rendering/artifacts.py +95 -0
- data_engine/ui/gui/rendering/icons.py +50 -0
- data_engine/ui/gui/runtime.py +47 -0
- data_engine/ui/gui/state_support.py +193 -0
- data_engine/ui/gui/support.py +214 -0
- data_engine/ui/gui/surface.py +209 -0
- data_engine/ui/gui/theme.py +720 -0
- data_engine/ui/gui/widgets/__init__.py +34 -0
- data_engine/ui/gui/widgets/config.py +41 -0
- data_engine/ui/gui/widgets/logs.py +62 -0
- data_engine/ui/gui/widgets/panels.py +507 -0
- data_engine/ui/gui/widgets/sidebar.py +130 -0
- data_engine/ui/gui/widgets/steps.py +84 -0
- data_engine/ui/tui/__init__.py +5 -0
- data_engine/ui/tui/app.py +222 -0
- data_engine/ui/tui/bootstrap.py +475 -0
- data_engine/ui/tui/bootstrapper.py +117 -0
- data_engine/ui/tui/controllers/__init__.py +6 -0
- data_engine/ui/tui/controllers/flows.py +349 -0
- data_engine/ui/tui/controllers/runtime.py +167 -0
- data_engine/ui/tui/runtime.py +34 -0
- data_engine/ui/tui/state_support.py +141 -0
- data_engine/ui/tui/support.py +63 -0
- data_engine/ui/tui/theme.py +204 -0
- data_engine/ui/tui/widgets.py +123 -0
- data_engine/views/__init__.py +109 -0
- data_engine/views/actions.py +80 -0
- data_engine/views/artifacts.py +58 -0
- data_engine/views/flow_display.py +69 -0
- data_engine/views/logs.py +54 -0
- data_engine/views/models.py +96 -0
- data_engine/views/presentation.py +133 -0
- data_engine/views/runs.py +62 -0
- data_engine/views/state.py +39 -0
- data_engine/views/status.py +13 -0
- data_engine/views/text.py +109 -0
- py_data_engine-0.1.0.dist-info/METADATA +330 -0
- py_data_engine-0.1.0.dist-info/RECORD +200 -0
- py_data_engine-0.1.0.dist-info/WHEEL +5 -0
- py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
- py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# Configuring Flows
|
|
2
|
+
|
|
3
|
+
Per-flow configuration lives in the fluent `Flow` chain, not in TOML.
|
|
4
|
+
|
|
5
|
+
That is an important design choice:
|
|
6
|
+
|
|
7
|
+
- the runtime shape of a flow belongs in the authored `Flow(...)` definition
|
|
8
|
+
- workspace-local TOML in `config/` is for step logic and runtime parameters consumed by your code
|
|
9
|
+
- there is no separate "expand this flow into several configured variants" layer after `build()`
|
|
10
|
+
|
|
11
|
+
## Core fields
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
Flow(group="Claims")
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
`group` is author-defined. The flow-module filename provides the flow identity.
|
|
18
|
+
|
|
19
|
+
Use `group` to cluster related flows in the UI and runtime model. A good rule of thumb is that a group should mean "these flows belong to the same operator-facing area of work."
|
|
20
|
+
|
|
21
|
+
## Watching
|
|
22
|
+
|
|
23
|
+
Single-file polling:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
Flow(group="Settings").watch(
|
|
27
|
+
mode="poll",
|
|
28
|
+
source="../../example_data/Settings/single_watch.xlsx",
|
|
29
|
+
interval="5s",
|
|
30
|
+
).mirror(
|
|
31
|
+
root="../../example_data/Output/example_single_watch",
|
|
32
|
+
)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Directory polling:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
Flow(group="Claims").watch(
|
|
39
|
+
mode="poll",
|
|
40
|
+
source="../../example_data/Input/claims_flat",
|
|
41
|
+
interval="5s",
|
|
42
|
+
extensions=[".xlsx", ".xls", ".xlsm"],
|
|
43
|
+
settle=1,
|
|
44
|
+
).mirror(
|
|
45
|
+
root="../../example_data/Output/example_mirror",
|
|
46
|
+
)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Scheduled batch runs:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
Flow(group="Analytics").watch(
|
|
53
|
+
mode="schedule",
|
|
54
|
+
run_as="batch",
|
|
55
|
+
interval="15m",
|
|
56
|
+
source="../../example_data/Input/claims_flat",
|
|
57
|
+
).mirror(
|
|
58
|
+
root="../../example_data/Output/example_summary",
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
Flow(group="Settings").watch(
|
|
62
|
+
mode="schedule",
|
|
63
|
+
run_as="batch",
|
|
64
|
+
time="10:31",
|
|
65
|
+
source="../../example_data/Settings/single_watch.xlsx",
|
|
66
|
+
).mirror(
|
|
67
|
+
root="../../example_data/Output/example_schedule",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
Flow(group="Settings").watch(
|
|
71
|
+
mode="schedule",
|
|
72
|
+
run_as="batch",
|
|
73
|
+
time=["08:15", "14:45"],
|
|
74
|
+
source="../../example_data/Settings/single_watch.xlsx",
|
|
75
|
+
)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
What watching owns:
|
|
79
|
+
|
|
80
|
+
- source selection
|
|
81
|
+
- ledger-backed source freshness tracking
|
|
82
|
+
- extension filtering for poll directory sources
|
|
83
|
+
- settle/debounce behavior for poll flows
|
|
84
|
+
- whether runtime executes per file or as one root-level batch via `run_as=`
|
|
85
|
+
|
|
86
|
+
What watching does not own:
|
|
87
|
+
|
|
88
|
+
- dataframe reads
|
|
89
|
+
- dataframe transforms
|
|
90
|
+
- database work
|
|
91
|
+
- output writing
|
|
92
|
+
|
|
93
|
+
That separation is what keeps `watch(...)` readable. It tells the engine when and why to run, not how to do the underlying data work.
|
|
94
|
+
|
|
95
|
+
`watch(mode="schedule", ...)` accepts exactly one scheduling parameter — either `interval=` or `time=` (where `time` may be a single value or a list):
|
|
96
|
+
|
|
97
|
+
- `interval="10m"`
|
|
98
|
+
- `time="HH:MM"`
|
|
99
|
+
- `time=["08:15", "14:45"]`
|
|
100
|
+
|
|
101
|
+
It may also bind an optional `source=...` path for recurring jobs.
|
|
102
|
+
|
|
103
|
+
### `run_as`
|
|
104
|
+
|
|
105
|
+
`run_as` controls what the runtime treats as one unit of work.
|
|
106
|
+
|
|
107
|
+
Common values are:
|
|
108
|
+
|
|
109
|
+
- `run_as="individual"`: one run per concrete source file
|
|
110
|
+
- `run_as="batch"`: one run at the watched root
|
|
111
|
+
|
|
112
|
+
Use `individual` when each source file should be processed independently.
|
|
113
|
+
|
|
114
|
+
Use `batch` when the flow should reason about the watched source as one collection, such as "all current workbooks in this folder."
|
|
115
|
+
|
|
116
|
+
### Poll-specific options
|
|
117
|
+
|
|
118
|
+
`extensions=` limits which files in a polled directory participate in freshness checks and execution.
|
|
119
|
+
|
|
120
|
+
`settle=` adds debounce behavior so the engine does not immediately react to a file that is still being written by another process.
|
|
121
|
+
|
|
122
|
+
## Mirror bindings
|
|
123
|
+
|
|
124
|
+
Use `mirror(root=...)` when a flow needs source-relative output routing.
|
|
125
|
+
|
|
126
|
+
Inside steps:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
context.mirror.with_suffix(".parquet")
|
|
130
|
+
context.mirror.file("summary.json")
|
|
131
|
+
context.mirror.namespaced_file("open_claims.parquet")
|
|
132
|
+
context.mirror.root_file("analytics.duckdb")
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
`mirror(...)` does not write files. It only defines the output namespace available at runtime.
|
|
136
|
+
|
|
137
|
+
If a flow has no natural mirrored outputs, you do not need `mirror(...)`.
|
|
138
|
+
|
|
139
|
+
If a flow writes several related outputs, `mirror(...)` is usually the cleanest way to keep them organized without scattering path math through your steps.
|
|
140
|
+
|
|
141
|
+
## Batch workflows
|
|
142
|
+
|
|
143
|
+
Use `collect(...)` together with `map(...)` — or its alias `step_each(...)` — for folder-oriented processing:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
Flow(group="Analytics") \
|
|
147
|
+
.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat") \
|
|
148
|
+
.collect([".xlsx"], save_as="claim_files") \
|
|
149
|
+
.map(read_claims, use="claim_files", save_as="claim_frames")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
`map(...)` is the per-item stage in that pipeline, and `step_each(...)` is the equivalent alias. Both raise immediately when the batch is empty.
|
|
153
|
+
|
|
154
|
+
This is the standard batch shape:
|
|
155
|
+
|
|
156
|
+
1. watch a directory or scheduled source root
|
|
157
|
+
2. collect matching files into a `Batch`
|
|
158
|
+
3. map one callable across each file
|
|
159
|
+
4. switch back to `step(...)` once you want to reason about the combined result
|
|
160
|
+
|
|
161
|
+
## Configuring step labels and saved objects
|
|
162
|
+
|
|
163
|
+
Flow configuration also includes the names and labels you assign in the chain.
|
|
164
|
+
|
|
165
|
+
Examples:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
(
|
|
169
|
+
Flow(group="Claims")
|
|
170
|
+
.step(read_claims, save_as="raw_df")
|
|
171
|
+
.step(clean_claims, use="raw_df", save_as="clean_df")
|
|
172
|
+
.step(write_output, use="clean_df", label="Write Parquet")
|
|
173
|
+
)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Those fields affect the authoring experience directly:
|
|
177
|
+
|
|
178
|
+
- `save_as=` creates stable names for later steps and notebook previews
|
|
179
|
+
- `use=` loads one of those saved names into `context.current`
|
|
180
|
+
- `label=` controls the display name in the UI
|
|
181
|
+
|
|
182
|
+
If you are deciding where a piece of information belongs:
|
|
183
|
+
|
|
184
|
+
- if it shapes orchestration, put it in the `Flow` chain
|
|
185
|
+
- if it shapes step logic, put it in your code or in `context.config`
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# Core Concepts
|
|
2
|
+
|
|
3
|
+
## Flow
|
|
4
|
+
|
|
5
|
+
A `Flow` is an immutable definition with:
|
|
6
|
+
|
|
7
|
+
- `group`
|
|
8
|
+
- an optional trigger via `watch(...)`
|
|
9
|
+
- an optional mirrored output binding via `mirror(...)`
|
|
10
|
+
- ordered generic steps
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from data_engine import Flow
|
|
14
|
+
|
|
15
|
+
flow = Flow(group="Claims")
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
The flow-module filename is the flow identity used for discovery and runtime bookkeeping. `group` is the author-controlled grouping visible in the UI.
|
|
19
|
+
|
|
20
|
+
## Runtime modes
|
|
21
|
+
|
|
22
|
+
Manual:
|
|
23
|
+
|
|
24
|
+
- no trigger configured
|
|
25
|
+
- `run_once()` executes the steps once with `context.current = None`
|
|
26
|
+
- useful for button-driven operator runs or preview-oriented flows
|
|
27
|
+
|
|
28
|
+
Poll:
|
|
29
|
+
|
|
30
|
+
- source-driven execution over either one file or a directory of files
|
|
31
|
+
- the runtime compares the current source file signature against the persisted runtime ledger
|
|
32
|
+
- the first step sees the active input through `context.source`
|
|
33
|
+
- startup backlog handling is based on persisted ledger state for each source version
|
|
34
|
+
- intermediate saved objects do not participate in staleness checks
|
|
35
|
+
|
|
36
|
+
Schedule:
|
|
37
|
+
|
|
38
|
+
- interval-driven via `watch(mode="schedule", interval="15m")`
|
|
39
|
+
- or wall-clock via `watch(mode="schedule", time="10:31")`
|
|
40
|
+
- `time` may also be a collection such as `["08:15", "14:45"]`
|
|
41
|
+
- may optionally bind a `source=...` path for recurring jobs
|
|
42
|
+
|
|
43
|
+
The distinction between poll and schedule is important:
|
|
44
|
+
|
|
45
|
+
- poll is source freshness driven
|
|
46
|
+
- schedule is time driven
|
|
47
|
+
|
|
48
|
+
You can combine scheduled execution with a source binding when the flow should run on a schedule but still read from a known source root.
|
|
49
|
+
|
|
50
|
+
## Step
|
|
51
|
+
|
|
52
|
+
Each `step(...)` is one callable:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
def step(context) -> object:
|
|
56
|
+
...
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
The return value always becomes `context.current`.
|
|
60
|
+
|
|
61
|
+
This is the main design boundary:
|
|
62
|
+
|
|
63
|
+
- the fluent API orchestrates runtime behavior
|
|
64
|
+
- native libraries perform the actual data and file work
|
|
65
|
+
|
|
66
|
+
That means Data Engine is intentionally not trying to replace Polars, DuckDB, pathlib, or your Python helper code. It coordinates them.
|
|
67
|
+
|
|
68
|
+
## Saved objects
|
|
69
|
+
|
|
70
|
+
Steps can save and reuse values:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
(
|
|
74
|
+
Flow(group="Docs")
|
|
75
|
+
.step(read_claims, save_as="raw_df")
|
|
76
|
+
.step(clean_claims, use="raw_df", save_as="clean_df")
|
|
77
|
+
.step(write_output, use="clean_df")
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
- `use="name"` loads `context.objects["name"]` into `context.current`
|
|
82
|
+
- `save_as="name"` stores the returned value into `context.objects["name"]`
|
|
83
|
+
|
|
84
|
+
In notebooks, those saved names are also the easiest way to inspect intermediates:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
build().preview(use="clean_df").head(10)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
This is one of the most useful parts of the authoring model:
|
|
91
|
+
|
|
92
|
+
- `current` gives you the current object in the pipeline
|
|
93
|
+
- `objects` gives you stable named waypoints
|
|
94
|
+
|
|
95
|
+
That makes it easy to structure flows around a few explicit intermediate states rather than one long opaque chain.
|
|
96
|
+
|
|
97
|
+
## Batch mapping
|
|
98
|
+
|
|
99
|
+
`collect(...)` and `map(...)` are the batch-oriented authoring tools.
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def read_claims(file_ref):
|
|
103
|
+
return pl.read_excel(file_ref.path)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def combine_claims(context):
|
|
107
|
+
return pl.concat(context.current, how="vertical_relaxed")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
flow = (
|
|
111
|
+
Flow(group="Analytics")
|
|
112
|
+
.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
|
|
113
|
+
.collect([".xlsx"], save_as="claim_files")
|
|
114
|
+
.map(read_claims, use="claim_files", save_as="claim_frames")
|
|
115
|
+
.step(combine_claims, use="claim_frames")
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Use `map(...)` when the same callable should run once per batch item instead of once per whole flow. `map(...)` raises immediately when the batch is empty.
|
|
120
|
+
|
|
121
|
+
Batch mapping is especially useful when you want to:
|
|
122
|
+
|
|
123
|
+
- read many files into many dataframes
|
|
124
|
+
- validate one file at a time
|
|
125
|
+
- emit one lightweight record per source item before combining
|
|
126
|
+
|
|
127
|
+
Use a normal `step(...)` when the callable should reason about the batch as a whole.
|
|
128
|
+
|
|
129
|
+
## Source and mirror namespaces
|
|
130
|
+
|
|
131
|
+
The runtime exposes two structured path namespaces:
|
|
132
|
+
|
|
133
|
+
- `context.source`
|
|
134
|
+
- `context.mirror`
|
|
135
|
+
|
|
136
|
+
Examples:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
context.source.path
|
|
140
|
+
context.source.with_extension(".json")
|
|
141
|
+
context.source.with_suffix(".json")
|
|
142
|
+
context.source.file("notes.json")
|
|
143
|
+
context.source.namespaced_file("notes.json")
|
|
144
|
+
context.source.root_file("lookup.csv")
|
|
145
|
+
|
|
146
|
+
context.mirror.with_extension(".parquet")
|
|
147
|
+
context.mirror.with_suffix(".parquet")
|
|
148
|
+
context.mirror.file("open_claims.parquet")
|
|
149
|
+
context.mirror.namespaced_file("open_claims.parquet")
|
|
150
|
+
context.mirror.root_file("analytics.duckdb")
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
`context.source` resolves read-side paths. `context.mirror` resolves write-ready output paths.
|
|
154
|
+
|
|
155
|
+
The important difference is:
|
|
156
|
+
|
|
157
|
+
- `source` is about where the active input lives
|
|
158
|
+
- `mirror` is about where outputs for that input should go
|
|
159
|
+
|
|
160
|
+
That lets you keep path logic readable and source-aware without hand-building relative paths in every step.
|
|
161
|
+
|
|
162
|
+
Examples of common patterns:
|
|
163
|
+
|
|
164
|
+
- read a sidecar file beside the current source with `context.source.file("notes.json")`
|
|
165
|
+
- write one mirrored parquet beside the source shape with `context.mirror.with_suffix(".parquet")`
|
|
166
|
+
- write multiple outputs for the same source with `context.mirror.namespaced_file(...)`
|
|
167
|
+
- write a stable root-level artifact such as a snapshot or DuckDB file with `context.mirror.root_file(...)`
|
|
168
|
+
|
|
169
|
+
## Discovery
|
|
170
|
+
|
|
171
|
+
The desktop UI and Python entrypoints discover flows from compiled flow modules.
|
|
172
|
+
|
|
173
|
+
Each discovered flow module contributes:
|
|
174
|
+
|
|
175
|
+
- a module name
|
|
176
|
+
- optional `DESCRIPTION`
|
|
177
|
+
- `build() -> Flow`
|
|
178
|
+
|
|
179
|
+
The flow-module filename/module name is the flow identity surfaced in discovery and execution. The UI uses `Flow.label` when present, otherwise it derives a readable title from that internal name.
|
|
180
|
+
|
|
181
|
+
That discovered `Flow` object is what the UI inspects for:
|
|
182
|
+
|
|
183
|
+
- grouping
|
|
184
|
+
- step labels
|
|
185
|
+
- runtime mode
|
|
186
|
+
- source and mirror bindings
|
|
187
|
+
|
|
188
|
+
The app does not maintain a second hidden config layer that mutates flow behavior after discovery. The authored `Flow` is the real contract the runtime and UI are looking at.
|
|
189
|
+
|
|
190
|
+
## Workspaces
|
|
191
|
+
|
|
192
|
+
Flows do not exist in isolation. They are discovered from the currently selected authored workspace.
|
|
193
|
+
|
|
194
|
+
An authored workspace typically contains:
|
|
195
|
+
|
|
196
|
+
- `flow_modules/`
|
|
197
|
+
- `flow_modules/flow_helpers/`
|
|
198
|
+
- `config/`
|
|
199
|
+
- `databases/`
|
|
200
|
+
|
|
201
|
+
The desktop app binds to one workspace at a time. When the selected workspace changes, the app reloads:
|
|
202
|
+
|
|
203
|
+
- discovered flows
|
|
204
|
+
- local runtime state
|
|
205
|
+
- daemon control state
|
|
206
|
+
- visible runs and logs
|
|
207
|
+
|
|
208
|
+
For the control and state model behind that, see [App Runtime and Workspaces](app-runtime-and-workspaces.md).
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Database Methods
|
|
2
|
+
|
|
3
|
+
There is no first-class database sub-chain. Use DuckDB directly inside step callables, usually with a workspace-local database path from `context.database(...)`.
|
|
4
|
+
|
|
5
|
+
If you want common warehouse-style shortcuts, see [DuckDB Helpers](duckdb-helpers.md). That helper layer covers several repeated patterns without taking over general SQL authoring.
|
|
6
|
+
|
|
7
|
+
That is intentional. The core API deliberately avoids hiding connection ownership, transactions, or query semantics behind a special fluent DSL.
|
|
8
|
+
|
|
9
|
+
In practice, that means:
|
|
10
|
+
|
|
11
|
+
- Data Engine gives you a conventional path
|
|
12
|
+
- your step opens and closes the database connection
|
|
13
|
+
- normal DuckDB and Python rules apply
|
|
14
|
+
|
|
15
|
+
## `context.database(...)`
|
|
16
|
+
|
|
17
|
+
`context.database(name)` returns a path beneath the current authored workspace's `databases/` folder.
|
|
18
|
+
|
|
19
|
+
Examples:
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
context.database("analytics.duckdb")
|
|
23
|
+
context.database("claims/analytics.duckdb")
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Those resolve to:
|
|
27
|
+
|
|
28
|
+
- `workspaces/<workspace_id>/databases/analytics.duckdb`
|
|
29
|
+
- `workspaces/<workspace_id>/databases/claims/analytics.duckdb`
|
|
30
|
+
|
|
31
|
+
Rules:
|
|
32
|
+
|
|
33
|
+
- the path must be relative
|
|
34
|
+
- parent directories are created automatically
|
|
35
|
+
- the helper is only available for authored workspace flows
|
|
36
|
+
- it does not create a connection for you
|
|
37
|
+
|
|
38
|
+
That last point is important. Returning the path avoids hidden connection lifetime issues and keeps the behavior obvious.
|
|
39
|
+
|
|
40
|
+
## Example
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import duckdb
|
|
44
|
+
import polars as pl
|
|
45
|
+
|
|
46
|
+
from data_engine import Flow
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def read_claims(file_ref):
|
|
50
|
+
return pl.read_excel(file_ref.path)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def build_source(context):
|
|
54
|
+
return pl.concat(context.current, how="vertical_relaxed")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def summarize(context):
|
|
58
|
+
conn = duckdb.connect(context.database("analytics.duckdb"))
|
|
59
|
+
try:
|
|
60
|
+
conn.register("input", context.current)
|
|
61
|
+
return conn.sql(
|
|
62
|
+
"""
|
|
63
|
+
select workflow, count(*) as row_count
|
|
64
|
+
from input
|
|
65
|
+
group by workflow
|
|
66
|
+
"""
|
|
67
|
+
).pl()
|
|
68
|
+
finally:
|
|
69
|
+
conn.close()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def build():
|
|
73
|
+
return (
|
|
74
|
+
Flow(group="Analytics")
|
|
75
|
+
.watch(
|
|
76
|
+
mode="schedule",
|
|
77
|
+
run_as="batch",
|
|
78
|
+
interval="15m",
|
|
79
|
+
source="../../example_data/Input/claims_flat",
|
|
80
|
+
)
|
|
81
|
+
.mirror(root="../../example_data/Output/example_summary")
|
|
82
|
+
.collect([".xlsx"], save_as="claim_files")
|
|
83
|
+
.map(read_claims, use="claim_files", save_as="claim_frames")
|
|
84
|
+
.step(build_source, use="claim_frames", save_as="raw_df")
|
|
85
|
+
.step(summarize, use="raw_df", save_as="summary_df")
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
This keeps the flow API small while still letting flow modules use native SQL and native DuckDB connections.
|
|
90
|
+
|
|
91
|
+
## Good patterns
|
|
92
|
+
|
|
93
|
+
- open the connection inside the step that needs it
|
|
94
|
+
- close the connection in `finally:`
|
|
95
|
+
- keep the path stable when you want incremental or append-oriented databases
|
|
96
|
+
- use subfolders such as `claims/analytics.duckdb` when one workspace owns several related databases
|
|
97
|
+
|
|
98
|
+
## A note on mirror vs database paths
|
|
99
|
+
|
|
100
|
+
If the database is a durable workspace-local asset, prefer `context.database(...)`.
|
|
101
|
+
|
|
102
|
+
If the database is really just another output artifact produced by one mirrored source flow, `context.mirror.root_file("analytics.duckdb")` can still be appropriate.
|
|
103
|
+
|
|
104
|
+
The difference is mostly semantic:
|
|
105
|
+
|
|
106
|
+
- `context.database(...)` says "this belongs to the workspace as a local database"
|
|
107
|
+
- `context.mirror...` says "this belongs to this flow's output namespace"
|